author      Tadeusz Struk <tadeusz.struk@intel.com>      2010-11-04 15:00:45 -0400
committer   Herbert Xu <herbert@gondor.apana.org.au>     2010-11-13 07:47:55 -0500
commit      0bd82f5f6355775fbaf7d3c664432ce1b862be1e (patch)
tree        5f7f7348c2681d572e8bc11f27a42a6e2b8f4023 /arch/x86/crypto
parent      895be15745d59cc7ede0e1c203e3432b0abdb71c (diff)
crypto: aesni-intel - RFC4106 AES-GCM Driver Using Intel New Instructions
This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit
kernels. It supports the 128-bit AES key size only. The implementation uses
the crypto layer's AEAD interface so that the combined AES counter-mode
encryption and GHASH authentication can be performed in a single pass of
assembly code. The assembly code uses the Intel(R) AES New Instructions
(AES-NI) and the PCLMULQDQ instruction.
Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
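For context, a minimal sketch of how a kernel-side user could drive the resulting "rfc4106(gcm(aes))" transform through the AEAD API of this kernel generation is shown below. This is not part of the patch; the function, scatterlist and key names are illustrative, error paths are abbreviated, and synchronous completion is assumed.

    #include <crypto/aead.h>
    #include <linux/err.h>
    #include <linux/scatterlist.h>

    /* Illustrative only: sg_assoc/sg_src/sg_dst are caller-prepared scatterlists;
     * the 20-byte key is the AES-128 key followed by the 4-byte RFC4106 salt. */
    static int example_rfc4106_encrypt(struct scatterlist *sg_assoc,
                                       struct scatterlist *sg_src,
                                       struct scatterlist *sg_dst,
                                       unsigned int len, const u8 *key, u8 *iv)
    {
            struct crypto_aead *tfm;
            struct aead_request *req;
            int err;

            tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            err = crypto_aead_setkey(tfm, key, 20);        /* 16-byte key + 4-byte nonce */
            if (!err)
                    err = crypto_aead_setauthsize(tfm, 16);

            req = aead_request_alloc(tfm, GFP_KERNEL);
            if (!req) {
                    crypto_free_aead(tfm);
                    return -ENOMEM;
            }
            aead_request_set_assoc(req, sg_assoc, 8);      /* e.g. SPI + sequence number */
            aead_request_set_crypt(req, sg_src, sg_dst, len, iv); /* iv: 8-byte explicit IV */
            if (!err)
                    err = crypto_aead_encrypt(req);        /* may return -EINPROGRESS if async */

            aead_request_free(req);
            crypto_free_aead(tfm);
            return err;
    }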
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/aesni-intel_asm.S  | 1192
-rw-r--r-- | arch/x86/crypto/aesni-intel_glue.c |  518
2 files changed, 1708 insertions, 2 deletions
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..aafced54df64 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,17 @@ | |||
9 | * Vinodh Gopal <vinodh.gopal@intel.com> | 9 | * Vinodh Gopal <vinodh.gopal@intel.com> |
10 | * Kahraman Akdemir | 10 | * Kahraman Akdemir |
11 | * | 11 | * |
12 | * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD | ||
13 | * interface for 64-bit kernels. | ||
14 | * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) | ||
15 | * Aidan O'Mahony (aidan.o.mahony@intel.com) | ||
16 | * Adrian Hoban <adrian.hoban@intel.com> | ||
17 | * James Guilford (james.guilford@intel.com) | ||
18 | * Gabriele Paoloni <gabriele.paoloni@intel.com> | ||
19 | * Tadeusz Struk (tadeusz.struk@intel.com) | ||
20 | * Wajdi Feghali (wajdi.k.feghali@intel.com) | ||
21 | * Copyright (c) 2010, Intel Corporation. | ||
22 | * | ||
12 | * This program is free software; you can redistribute it and/or modify | 23 | * This program is free software; you can redistribute it and/or modify |
13 | * it under the terms of the GNU General Public License as published by | 24 | * it under the terms of the GNU General Public License as published by |
14 | * the Free Software Foundation; either version 2 of the License, or | 25 | * the Free Software Foundation; either version 2 of the License, or |
@@ -18,8 +29,60 @@ | |||
18 | #include <linux/linkage.h> | 29 | #include <linux/linkage.h> |
19 | #include <asm/inst.h> | 30 | #include <asm/inst.h> |
20 | 31 | ||
32 | .data | ||
33 | POLY: .octa 0xC2000000000000000000000000000001 | ||
34 | TWOONE: .octa 0x00000001000000000000000000000001 | ||
35 | |||
36 | # order of these constants should not change. | ||
37 | # more specifically, ALL_F should follow SHIFT_MASK, | ||
38 | # and ZERO should follow ALL_F | ||
39 | |||
40 | SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F | ||
41 | MASK1: .octa 0x0000000000000000ffffffffffffffff | ||
42 | MASK2: .octa 0xffffffffffffffff0000000000000000 | ||
43 | SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 | ||
44 | ALL_F: .octa 0xffffffffffffffffffffffffffffffff | ||
45 | ZERO: .octa 0x00000000000000000000000000000000 | ||
46 | ONE: .octa 0x00000000000000000000000000000001 | ||
47 | F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 | ||
48 | dec: .octa 0x1 | ||
49 | enc: .octa 0x2 | ||
50 | |||
51 | |||
21 | .text | 52 | .text |
22 | 53 | ||
54 | |||
55 | #define STACK_OFFSET 8*3 | ||
56 | #define HashKey 16*0 // store HashKey <<1 mod poly here | ||
57 | #define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here | ||
58 | #define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here | ||
59 | #define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here | ||
60 | #define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 | ||
61 | // bits of HashKey <<1 mod poly here | ||
62 | //(for Karatsuba purposes) | ||
63 | #define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 | ||
64 | // bits of HashKey^2 <<1 mod poly here | ||
65 | // (for Karatsuba purposes) | ||
66 | #define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 | ||
67 | // bits of HashKey^3 <<1 mod poly here | ||
68 | // (for Karatsuba purposes) | ||
69 | #define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 | ||
70 | // bits of HashKey^4 <<1 mod poly here | ||
71 | // (for Karatsuba purposes) | ||
72 | #define VARIABLE_OFFSET 16*8 | ||
73 | |||
74 | #define arg1 rdi | ||
75 | #define arg2 rsi | ||
76 | #define arg3 rdx | ||
77 | #define arg4 rcx | ||
78 | #define arg5 r8 | ||
79 | #define arg6 r9 | ||
80 | #define arg7 STACK_OFFSET+8(%r14) | ||
81 | #define arg8 STACK_OFFSET+16(%r14) | ||
82 | #define arg9 STACK_OFFSET+24(%r14) | ||
83 | #define arg10 STACK_OFFSET+32(%r14) | ||
84 | |||
85 | |||
23 | #define STATE1 %xmm0 | 86 | #define STATE1 %xmm0 |
24 | #define STATE2 %xmm4 | 87 | #define STATE2 %xmm4 |
25 | #define STATE3 %xmm5 | 88 | #define STATE3 %xmm5 |
@@ -47,6 +110,1135 @@ | |||
47 | #define T2 %r11 | 110 | #define T2 %r11 |
48 | #define TCTR_LOW T2 | 111 | #define TCTR_LOW T2 |
49 | 112 | ||
113 | |||
114 | /* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) | ||
115 | * | ||
116 | * | ||
117 | * Input: A and B (128-bits each, bit-reflected) | ||
118 | * Output: C = A*B*x mod poly, (i.e. >>1 ) | ||
119 | * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input | ||
120 | * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. | ||
121 | * | ||
122 | */ | ||
123 | .macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 | ||
124 | movdqa \GH, \TMP1 | ||
125 | pshufd $78, \GH, \TMP2 | ||
126 | pshufd $78, \HK, \TMP3 | ||
127 | pxor \GH, \TMP2 # TMP2 = a1+a0 | ||
128 | pxor \HK, \TMP3 # TMP3 = b1+b0 | ||
129 | PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 | ||
130 | PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 | ||
131 | PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) | ||
132 | pxor \GH, \TMP2 | ||
133 | pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (middle Karatsuba term) | ||
134 | movdqa \TMP2, \TMP3 | ||
135 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | ||
136 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
137 | pxor \TMP3, \GH | ||
138 | pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK | ||
139 | |||
140 | # first phase of the reduction | ||
141 | |||
142 | movdqa \GH, \TMP2 | ||
143 | movdqa \GH, \TMP3 | ||
144 | movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 | ||
145 | # in order to perform | ||
146 | # independent shifts | ||
147 | pslld $31, \TMP2 # packed left shift <<31 | ||
148 | pslld $30, \TMP3 # packed left shift <<30 | ||
149 | pslld $25, \TMP4 # packed left shift <<25 | ||
150 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
151 | pxor \TMP4, \TMP2 | ||
152 | movdqa \TMP2, \TMP5 | ||
153 | psrldq $4, \TMP5 # right shift TMP5 1 DW | ||
154 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | ||
155 | pxor \TMP2, \GH | ||
156 | |||
157 | # second phase of the reduction | ||
158 | |||
159 | movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 | ||
160 | # in order to perform | ||
161 | # independent shifts | ||
162 | movdqa \GH,\TMP3 | ||
163 | movdqa \GH,\TMP4 | ||
164 | psrld $1,\TMP2 # packed right shift >>1 | ||
165 | psrld $2,\TMP3 # packed right shift >>2 | ||
166 | psrld $7,\TMP4 # packed right shift >>7 | ||
167 | pxor \TMP3,\TMP2 # xor the shifted versions | ||
168 | pxor \TMP4,\TMP2 | ||
169 | pxor \TMP5, \TMP2 | ||
170 | pxor \TMP2, \GH | ||
171 | pxor \TMP1, \GH # result is in GH | ||
172 | .endm | ||
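As a plain-C cross-check of what the GHASH_MUL macro above computes with PCLMULQDQ and Karatsuba, the following bit-at-a-time reference model (per NIST SP 800-38D; not part of the patch and far slower than the SSE path) multiplies two 16-byte field elements modulo the GCM polynomial:

    #include <stdint.h>
    #include <string.h>

    /* Bit-at-a-time GF(2^128) multiply as defined for GHASH (NIST SP 800-38D).
     * x and h are 16-byte field elements in GCM's bit order; x is replaced by x*h.
     * The reduction constant 0xE1 is the reflected form of x^128 + x^7 + x^2 + x + 1. */
    static void ghash_gf128_mul(uint8_t x[16], const uint8_t h[16])
    {
            uint8_t z[16] = { 0 };
            uint8_t v[16];
            int i, j, carry;

            memcpy(v, h, 16);
            for (i = 0; i < 128; i++) {
                    if (x[i / 8] & (0x80 >> (i % 8)))       /* bit i of x set? */
                            for (j = 0; j < 16; j++)
                                    z[j] ^= v[j];
                    carry = v[15] & 1;                      /* v = v * x mod poly */
                    for (j = 15; j > 0; j--)
                            v[j] = (v[j] >> 1) | (v[j - 1] << 7);
                    v[0] >>= 1;
                    if (carry)
                            v[0] ^= 0xE1;
            }
            memcpy(x, z, 16);
    }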
173 | |||
174 | /* | ||
175 | * if a = number of total plaintext bytes | ||
176 | * b = floor(a/16) | ||
177 | * num_initial_blocks = b mod 4 | ||
178 | * encrypt the initial num_initial_blocks blocks and apply ghash on | ||
179 | * the ciphertext | ||
180 | * %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers | ||
181 | * are clobbered | ||
182 | * arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified | ||
183 | */ | ||
184 | |||
185 | .macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ | ||
186 | XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation | ||
187 | |||
188 | mov arg7, %r10 # %r10 = AAD | ||
189 | mov arg8, %r12 # %r12 = aadLen | ||
190 | mov %r12, %r11 | ||
191 | pxor %xmm\i, %xmm\i | ||
192 | _get_AAD_loop\num_initial_blocks\operation: | ||
193 | movd (%r10), \TMP1 | ||
194 | pslldq $12, \TMP1 | ||
195 | psrldq $4, %xmm\i | ||
196 | pxor \TMP1, %xmm\i | ||
197 | add $4, %r10 | ||
198 | sub $4, %r12 | ||
199 | jne _get_AAD_loop\num_initial_blocks\operation | ||
200 | cmp $16, %r11 | ||
201 | je _get_AAD_loop2_done\num_initial_blocks\operation | ||
202 | mov $16, %r12 | ||
203 | _get_AAD_loop2\num_initial_blocks\operation: | ||
204 | psrldq $4, %xmm\i | ||
205 | sub $4, %r12 | ||
206 | cmp %r11, %r12 | ||
207 | jne _get_AAD_loop2\num_initial_blocks\operation | ||
208 | _get_AAD_loop2_done\num_initial_blocks\operation: | ||
209 | pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data | ||
210 | xor %r11, %r11 # initialise the data pointer offset as zero | ||
211 | |||
212 | # start AES for num_initial_blocks blocks | ||
213 | |||
214 | mov %arg5, %rax # %rax = *Y0 | ||
215 | movdqu (%rax), \XMM0 # XMM0 = Y0 | ||
216 | pshufb SHUF_MASK(%rip), \XMM0 | ||
217 | .if \i_seq != 0 | ||
218 | .irpc index, \i_seq | ||
219 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
220 | movdqa \XMM0, %xmm\index | ||
221 | pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap | ||
222 | .endr | ||
223 | .irpc index, \i_seq | ||
224 | pxor 16*0(%arg1), %xmm\index | ||
225 | .endr | ||
226 | .irpc index, \i_seq | ||
227 | movaps 0x10(%rdi), \TMP1 | ||
228 | AESENC \TMP1, %xmm\index # Round 1 | ||
229 | .endr | ||
230 | .irpc index, \i_seq | ||
231 | movaps 0x20(%arg1), \TMP1 | ||
232 | AESENC \TMP1, %xmm\index # Round 2 | ||
233 | .endr | ||
234 | .irpc index, \i_seq | ||
235 | movaps 0x30(%arg1), \TMP1 | ||
236 | AESENC \TMP1, %xmm\index # Round 3 | ||
237 | .endr | ||
238 | .irpc index, \i_seq | ||
239 | movaps 0x40(%arg1), \TMP1 | ||
240 | AESENC \TMP1, %xmm\index # Round 4 | ||
241 | .endr | ||
242 | .irpc index, \i_seq | ||
243 | movaps 0x50(%arg1), \TMP1 | ||
244 | AESENC \TMP1, %xmm\index # Round 5 | ||
245 | .endr | ||
246 | .irpc index, \i_seq | ||
247 | movaps 0x60(%arg1), \TMP1 | ||
248 | AESENC \TMP1, %xmm\index # Round 6 | ||
249 | .endr | ||
250 | .irpc index, \i_seq | ||
251 | movaps 0x70(%arg1), \TMP1 | ||
252 | AESENC \TMP1, %xmm\index # Round 7 | ||
253 | .endr | ||
254 | .irpc index, \i_seq | ||
255 | movaps 0x80(%arg1), \TMP1 | ||
256 | AESENC \TMP1, %xmm\index # Round 8 | ||
257 | .endr | ||
258 | .irpc index, \i_seq | ||
259 | movaps 0x90(%arg1), \TMP1 | ||
260 | AESENC \TMP1, %xmm\index # Round 9 | ||
261 | .endr | ||
262 | .irpc index, \i_seq | ||
263 | movaps 0xa0(%arg1), \TMP1 | ||
264 | AESENCLAST \TMP1, %xmm\index # Round 10 | ||
265 | .endr | ||
266 | .irpc index, \i_seq | ||
267 | movdqu (%arg3 , %r11, 1), \TMP1 | ||
268 | pxor \TMP1, %xmm\index | ||
269 | movdqu %xmm\index, (%arg2 , %r11, 1) | ||
270 | # write back plaintext/ciphertext for num_initial_blocks | ||
271 | add $16, %r11 | ||
272 | .if \operation == dec | ||
273 | movdqa \TMP1, %xmm\index | ||
274 | .endif | ||
275 | pshufb SHUF_MASK(%rip), %xmm\index | ||
276 | # prepare plaintext/ciphertext for GHASH computation | ||
277 | .endr | ||
278 | .endif | ||
279 | GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
280 | # apply GHASH on num_initial_blocks blocks | ||
281 | |||
282 | .if \i == 5 | ||
283 | pxor %xmm5, %xmm6 | ||
284 | GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
285 | pxor %xmm6, %xmm7 | ||
286 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
287 | pxor %xmm7, %xmm8 | ||
288 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
289 | .elseif \i == 6 | ||
290 | pxor %xmm6, %xmm7 | ||
291 | GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
292 | pxor %xmm7, %xmm8 | ||
293 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
294 | .elseif \i == 7 | ||
295 | pxor %xmm7, %xmm8 | ||
296 | GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 | ||
297 | .endif | ||
298 | cmp $64, %r13 | ||
299 | jl _initial_blocks_done\num_initial_blocks\operation | ||
300 | # no need for precomputed values | ||
301 | /* | ||
302 | * | ||
303 | * Precomputations for HashKey parallel with encryption of first 4 blocks. | ||
304 | * Haskey_i_k holds XORed values of the low and high parts of the Haskey_i | ||
305 | */ | ||
306 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
307 | movdqa \XMM0, \XMM1 | ||
308 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
309 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
310 | movdqa \XMM0, \XMM2 | ||
311 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
312 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
313 | movdqa \XMM0, \XMM3 | ||
314 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
315 | paddd ONE(%rip), \XMM0 # INCR Y0 | ||
316 | movdqa \XMM0, \XMM4 | ||
317 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
318 | pxor 16*0(%arg1), \XMM1 | ||
319 | pxor 16*0(%arg1), \XMM2 | ||
320 | pxor 16*0(%arg1), \XMM3 | ||
321 | pxor 16*0(%arg1), \XMM4 | ||
322 | movdqa \TMP3, \TMP5 | ||
323 | pshufd $78, \TMP3, \TMP1 | ||
324 | pxor \TMP3, \TMP1 | ||
325 | movdqa \TMP1, HashKey_k(%rsp) | ||
326 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
327 | # TMP5 = HashKey^2<<1 (mod poly) | ||
328 | movdqa \TMP5, HashKey_2(%rsp) | ||
329 | # HashKey_2 = HashKey^2<<1 (mod poly) | ||
330 | pshufd $78, \TMP5, \TMP1 | ||
331 | pxor \TMP5, \TMP1 | ||
332 | movdqa \TMP1, HashKey_2_k(%rsp) | ||
333 | .irpc index, 1234 # do 4 rounds | ||
334 | movaps 0x10*\index(%arg1), \TMP1 | ||
335 | AESENC \TMP1, \XMM1 | ||
336 | AESENC \TMP1, \XMM2 | ||
337 | AESENC \TMP1, \XMM3 | ||
338 | AESENC \TMP1, \XMM4 | ||
339 | .endr | ||
340 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
341 | # TMP5 = HashKey^3<<1 (mod poly) | ||
342 | movdqa \TMP5, HashKey_3(%rsp) | ||
343 | pshufd $78, \TMP5, \TMP1 | ||
344 | pxor \TMP5, \TMP1 | ||
345 | movdqa \TMP1, HashKey_3_k(%rsp) | ||
346 | .irpc index, 56789 # do next 5 rounds | ||
347 | movaps 0x10*\index(%arg1), \TMP1 | ||
348 | AESENC \TMP1, \XMM1 | ||
349 | AESENC \TMP1, \XMM2 | ||
350 | AESENC \TMP1, \XMM3 | ||
351 | AESENC \TMP1, \XMM4 | ||
352 | .endr | ||
353 | GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 | ||
354 | # TMP5 = HashKey^4<<1 (mod poly) | ||
355 | movdqa \TMP5, HashKey_4(%rsp) | ||
356 | pshufd $78, \TMP5, \TMP1 | ||
357 | pxor \TMP5, \TMP1 | ||
358 | movdqa \TMP1, HashKey_4_k(%rsp) | ||
359 | movaps 0xa0(%arg1), \TMP2 | ||
360 | AESENCLAST \TMP2, \XMM1 | ||
361 | AESENCLAST \TMP2, \XMM2 | ||
362 | AESENCLAST \TMP2, \XMM3 | ||
363 | AESENCLAST \TMP2, \XMM4 | ||
364 | movdqu 16*0(%arg3 , %r11 , 1), \TMP1 | ||
365 | pxor \TMP1, \XMM1 | ||
366 | .if \operation == dec | ||
367 | movdqu \XMM1, 16*0(%arg2 , %r11 , 1) | ||
368 | movdqa \TMP1, \XMM1 | ||
369 | .endif | ||
370 | movdqu 16*1(%arg3 , %r11 , 1), \TMP1 | ||
371 | pxor \TMP1, \XMM2 | ||
372 | .if \operation == dec | ||
373 | movdqu \XMM2, 16*1(%arg2 , %r11 , 1) | ||
374 | movdqa \TMP1, \XMM2 | ||
375 | .endif | ||
376 | movdqu 16*2(%arg3 , %r11 , 1), \TMP1 | ||
377 | pxor \TMP1, \XMM3 | ||
378 | .if \operation == dec | ||
379 | movdqu \XMM3, 16*2(%arg2 , %r11 , 1) | ||
380 | movdqa \TMP1, \XMM3 | ||
381 | .endif | ||
382 | movdqu 16*3(%arg3 , %r11 , 1), \TMP1 | ||
383 | pxor \TMP1, \XMM4 | ||
384 | .if \operation == dec | ||
385 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | ||
386 | movdqa \TMP1, \XMM4 | ||
387 | .else | ||
388 | movdqu \XMM1, 16*0(%arg2 , %r11 , 1) | ||
389 | movdqu \XMM2, 16*1(%arg2 , %r11 , 1) | ||
390 | movdqu \XMM3, 16*2(%arg2 , %r11 , 1) | ||
391 | movdqu \XMM4, 16*3(%arg2 , %r11 , 1) | ||
392 | .endif | ||
393 | add $64, %r11 | ||
394 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
395 | pxor \XMMDst, \XMM1 | ||
396 | # combine GHASHed value with the corresponding ciphertext | ||
397 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
398 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
399 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
400 | _initial_blocks_done\num_initial_blocks\operation: | ||
401 | .endm | ||
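The _initial_num_blocks_is_N dispatch performed by aesni_gcm_enc/aesni_gcm_dec further down follows directly from the arithmetic in the comment above this macro; as a hypothetical C illustration (not driver code):

    /* Number of whole blocks peeled off before the 4-blocks-at-a-time main loop. */
    static unsigned int rfc4106_num_initial_blocks(unsigned long plaintext_len)
    {
            unsigned long whole_blocks = plaintext_len / 16;   /* b = floor(a/16) */

            return whole_blocks % 4;   /* 0, 1, 2 or 3, handled by INITIAL_BLOCKS */
    }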
402 | |||
403 | /* | ||
404 | * encrypt 4 blocks at a time | ||
405 | * ghash the 4 previously encrypted ciphertext blocks | ||
406 | * arg1, %arg2, %arg3 are used as pointers only, not modified | ||
407 | * %r11 is the data offset value | ||
408 | */ | ||
409 | .macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \ | ||
410 | TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation | ||
411 | |||
412 | movdqa \XMM1, \XMM5 | ||
413 | movdqa \XMM2, \XMM6 | ||
414 | movdqa \XMM3, \XMM7 | ||
415 | movdqa \XMM4, \XMM8 | ||
416 | |||
417 | # multiply TMP5 * HashKey using karatsuba | ||
418 | |||
419 | movdqa \XMM5, \TMP4 | ||
420 | pshufd $78, \XMM5, \TMP6 | ||
421 | pxor \XMM5, \TMP6 | ||
422 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
423 | movdqa HashKey_4(%rsp), \TMP5 | ||
424 | PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 | ||
425 | movdqa \XMM0, \XMM1 | ||
426 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
427 | movdqa \XMM0, \XMM2 | ||
428 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
429 | movdqa \XMM0, \XMM3 | ||
430 | paddd ONE(%rip), \XMM0 # INCR CNT | ||
431 | movdqa \XMM0, \XMM4 | ||
432 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
433 | PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 | ||
434 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
435 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
436 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
437 | pxor (%arg1), \XMM1 | ||
438 | pxor (%arg1), \XMM2 | ||
439 | pxor (%arg1), \XMM3 | ||
440 | pxor (%arg1), \XMM4 | ||
441 | movdqa HashKey_4_k(%rsp), \TMP5 | ||
442 | PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) | ||
443 | movaps 0x10(%arg1), \TMP1 | ||
444 | AESENC \TMP1, \XMM1 # Round 1 | ||
445 | AESENC \TMP1, \XMM2 | ||
446 | AESENC \TMP1, \XMM3 | ||
447 | AESENC \TMP1, \XMM4 | ||
448 | movaps 0x20(%arg1), \TMP1 | ||
449 | AESENC \TMP1, \XMM1 # Round 2 | ||
450 | AESENC \TMP1, \XMM2 | ||
451 | AESENC \TMP1, \XMM3 | ||
452 | AESENC \TMP1, \XMM4 | ||
453 | movdqa \XMM6, \TMP1 | ||
454 | pshufd $78, \XMM6, \TMP2 | ||
455 | pxor \XMM6, \TMP2 | ||
456 | movdqa HashKey_3(%rsp), \TMP5 | ||
457 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 | ||
458 | movaps 0x30(%arg1), \TMP3 | ||
459 | AESENC \TMP3, \XMM1 # Round 3 | ||
460 | AESENC \TMP3, \XMM2 | ||
461 | AESENC \TMP3, \XMM3 | ||
462 | AESENC \TMP3, \XMM4 | ||
463 | PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 | ||
464 | movaps 0x40(%arg1), \TMP3 | ||
465 | AESENC \TMP3, \XMM1 # Round 4 | ||
466 | AESENC \TMP3, \XMM2 | ||
467 | AESENC \TMP3, \XMM3 | ||
468 | AESENC \TMP3, \XMM4 | ||
469 | movdqa HashKey_3_k(%rsp), \TMP5 | ||
470 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
471 | movaps 0x50(%arg1), \TMP3 | ||
472 | AESENC \TMP3, \XMM1 # Round 5 | ||
473 | AESENC \TMP3, \XMM2 | ||
474 | AESENC \TMP3, \XMM3 | ||
475 | AESENC \TMP3, \XMM4 | ||
476 | pxor \TMP1, \TMP4 | ||
477 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
478 | pxor \XMM6, \XMM5 | ||
479 | pxor \TMP2, \TMP6 | ||
480 | movdqa \XMM7, \TMP1 | ||
481 | pshufd $78, \XMM7, \TMP2 | ||
482 | pxor \XMM7, \TMP2 | ||
483 | movdqa HashKey_2(%rsp ), \TMP5 | ||
484 | |||
485 | # Multiply TMP5 * HashKey using karatsuba | ||
486 | |||
487 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
488 | movaps 0x60(%arg1), \TMP3 | ||
489 | AESENC \TMP3, \XMM1 # Round 6 | ||
490 | AESENC \TMP3, \XMM2 | ||
491 | AESENC \TMP3, \XMM3 | ||
492 | AESENC \TMP3, \XMM4 | ||
493 | PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 | ||
494 | movaps 0x70(%arg1), \TMP3 | ||
495 | AESENC \TMP3, \XMM1 # Round 7 | ||
496 | AESENC \TMP3, \XMM2 | ||
497 | AESENC \TMP3, \XMM3 | ||
498 | AESENC \TMP3, \XMM4 | ||
499 | movdqa HashKey_2_k(%rsp), \TMP5 | ||
500 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
501 | movaps 0x80(%arg1), \TMP3 | ||
502 | AESENC \TMP3, \XMM1 # Round 8 | ||
503 | AESENC \TMP3, \XMM2 | ||
504 | AESENC \TMP3, \XMM3 | ||
505 | AESENC \TMP3, \XMM4 | ||
506 | pxor \TMP1, \TMP4 | ||
507 | # accumulate the results in TMP4:XMM5, TMP6 holds the middle part | ||
508 | pxor \XMM7, \XMM5 | ||
509 | pxor \TMP2, \TMP6 | ||
510 | |||
511 | # Multiply XMM8 * HashKey | ||
512 | # XMM8 and TMP5 hold the values for the two operands | ||
513 | |||
514 | movdqa \XMM8, \TMP1 | ||
515 | pshufd $78, \XMM8, \TMP2 | ||
516 | pxor \XMM8, \TMP2 | ||
517 | movdqa HashKey(%rsp), \TMP5 | ||
518 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
519 | movaps 0x90(%arg1), \TMP3 | ||
520 | AESENC \TMP3, \XMM1 # Round 9 | ||
521 | AESENC \TMP3, \XMM2 | ||
522 | AESENC \TMP3, \XMM3 | ||
523 | AESENC \TMP3, \XMM4 | ||
524 | PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 | ||
525 | movaps 0xa0(%arg1), \TMP3 | ||
526 | AESENCLAST \TMP3, \XMM1 # Round 10 | ||
527 | AESENCLAST \TMP3, \XMM2 | ||
528 | AESENCLAST \TMP3, \XMM3 | ||
529 | AESENCLAST \TMP3, \XMM4 | ||
530 | movdqa HashKey_k(%rsp), \TMP5 | ||
531 | PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
532 | movdqu (%arg3,%r11,1), \TMP3 | ||
533 | pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK | ||
534 | .if \operation == dec | ||
535 | movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer | ||
536 | movdqa \TMP3, \XMM1 | ||
537 | .endif | ||
538 | movdqu 16(%arg3,%r11,1), \TMP3 | ||
539 | pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK | ||
540 | .if \operation == dec | ||
541 | movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer | ||
542 | movdqa \TMP3, \XMM2 | ||
543 | .endif | ||
544 | movdqu 32(%arg3,%r11,1), \TMP3 | ||
545 | pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK | ||
546 | .if \operation == dec | ||
547 | movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer | ||
548 | movdqa \TMP3, \XMM3 | ||
549 | .endif | ||
550 | movdqu 48(%arg3,%r11,1), \TMP3 | ||
551 | pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK | ||
552 | .if \operation == dec | ||
553 | movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer | ||
554 | movdqa \TMP3, \XMM4 | ||
555 | .else | ||
556 | movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer | ||
557 | movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer | ||
558 | movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer | ||
559 | movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer | ||
560 | .endif | ||
561 | pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap | ||
562 | pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap | ||
563 | pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap | ||
564 | pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap | ||
565 | |||
566 | pxor \TMP4, \TMP1 | ||
567 | pxor \XMM8, \XMM5 | ||
568 | pxor \TMP6, \TMP2 | ||
569 | pxor \TMP1, \TMP2 | ||
570 | pxor \XMM5, \TMP2 | ||
571 | movdqa \TMP2, \TMP3 | ||
572 | pslldq $8, \TMP3 # left shift TMP3 2 DWs | ||
573 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
574 | pxor \TMP3, \XMM5 | ||
575 | pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 | ||
576 | |||
577 | # first phase of reduction | ||
578 | |||
579 | movdqa \XMM5, \TMP2 | ||
580 | movdqa \XMM5, \TMP3 | ||
581 | movdqa \XMM5, \TMP4 | ||
582 | # move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently | ||
583 | pslld $31, \TMP2 # packed left shift << 31 | ||
584 | pslld $30, \TMP3 # packed left shift << 30 | ||
585 | pslld $25, \TMP4 # packed left shift << 25 | ||
586 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
587 | pxor \TMP4, \TMP2 | ||
588 | movdqa \TMP2, \TMP5 | ||
589 | psrldq $4, \TMP5 # right shift T5 1 DW | ||
590 | pslldq $12, \TMP2 # left shift T2 3 DWs | ||
591 | pxor \TMP2, \XMM5 | ||
592 | |||
593 | # second phase of reduction | ||
594 | |||
595 | movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 | ||
596 | movdqa \XMM5,\TMP3 | ||
597 | movdqa \XMM5,\TMP4 | ||
598 | psrld $1, \TMP2 # packed right shift >>1 | ||
599 | psrld $2, \TMP3 # packed right shift >>2 | ||
600 | psrld $7, \TMP4 # packed right shift >>7 | ||
601 | pxor \TMP3,\TMP2 # xor the shifted versions | ||
602 | pxor \TMP4,\TMP2 | ||
603 | pxor \TMP5, \TMP2 | ||
604 | pxor \TMP2, \XMM5 | ||
605 | pxor \TMP1, \XMM5 # result is in XMM5 | ||
606 | |||
607 | pxor \XMM5, \XMM1 | ||
608 | .endm | ||
609 | |||
610 | /* GHASH the last 4 ciphertext blocks. */ | ||
611 | .macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ | ||
612 | TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst | ||
613 | |||
614 | # Multiply TMP6 * HashKey (using Karatsuba) | ||
615 | |||
616 | movdqa \XMM1, \TMP6 | ||
617 | pshufd $78, \XMM1, \TMP2 | ||
618 | pxor \XMM1, \TMP2 | ||
619 | movdqa HashKey_4(%rsp), \TMP5 | ||
620 | PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 | ||
621 | PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 | ||
622 | movdqa HashKey_4_k(%rsp), \TMP4 | ||
623 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
624 | movdqa \XMM1, \XMMDst | ||
625 | movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 | ||
626 | |||
627 | # Multiply TMP1 * HashKey (using Karatsuba) | ||
628 | |||
629 | movdqa \XMM2, \TMP1 | ||
630 | pshufd $78, \XMM2, \TMP2 | ||
631 | pxor \XMM2, \TMP2 | ||
632 | movdqa HashKey_3(%rsp), \TMP5 | ||
633 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
634 | PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 | ||
635 | movdqa HashKey_3_k(%rsp), \TMP4 | ||
636 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
637 | pxor \TMP1, \TMP6 | ||
638 | pxor \XMM2, \XMMDst | ||
639 | pxor \TMP2, \XMM1 | ||
640 | # results accumulated in TMP6, XMMDst, XMM1 | ||
641 | |||
642 | # Multiply TMP1 * HashKey (using Karatsuba) | ||
643 | |||
644 | movdqa \XMM3, \TMP1 | ||
645 | pshufd $78, \XMM3, \TMP2 | ||
646 | pxor \XMM3, \TMP2 | ||
647 | movdqa HashKey_2(%rsp), \TMP5 | ||
648 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
649 | PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 | ||
650 | movdqa HashKey_2_k(%rsp), \TMP4 | ||
651 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
652 | pxor \TMP1, \TMP6 | ||
653 | pxor \XMM3, \XMMDst | ||
654 | pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 | ||
655 | |||
656 | # Multiply TMP1 * HashKey (using Karatsuba) | ||
657 | movdqa \XMM4, \TMP1 | ||
658 | pshufd $78, \XMM4, \TMP2 | ||
659 | pxor \XMM4, \TMP2 | ||
660 | movdqa HashKey(%rsp), \TMP5 | ||
661 | PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 | ||
662 | PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 | ||
663 | movdqa HashKey_k(%rsp), \TMP4 | ||
664 | PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) | ||
665 | pxor \TMP1, \TMP6 | ||
666 | pxor \XMM4, \XMMDst | ||
667 | pxor \XMM1, \TMP2 | ||
668 | pxor \TMP6, \TMP2 | ||
669 | pxor \XMMDst, \TMP2 | ||
670 | # middle section of the temp results combined as in karatsuba algorithm | ||
671 | movdqa \TMP2, \TMP4 | ||
672 | pslldq $8, \TMP4 # left shift TMP4 2 DWs | ||
673 | psrldq $8, \TMP2 # right shift TMP2 2 DWs | ||
674 | pxor \TMP4, \XMMDst | ||
675 | pxor \TMP2, \TMP6 | ||
676 | # TMP6:XMMDst holds the result of the accumulated carry-less multiplications | ||
677 | # first phase of the reduction | ||
678 | movdqa \XMMDst, \TMP2 | ||
679 | movdqa \XMMDst, \TMP3 | ||
680 | movdqa \XMMDst, \TMP4 | ||
681 | # move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently | ||
682 | pslld $31, \TMP2 # packed left shifting << 31 | ||
683 | pslld $30, \TMP3 # packed left shifting << 30 | ||
684 | pslld $25, \TMP4 # packed left shifting << 25 | ||
685 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
686 | pxor \TMP4, \TMP2 | ||
687 | movdqa \TMP2, \TMP7 | ||
688 | psrldq $4, \TMP7 # right shift TMP7 1 DW | ||
689 | pslldq $12, \TMP2 # left shift TMP2 3 DWs | ||
690 | pxor \TMP2, \XMMDst | ||
691 | |||
692 | # second phase of the reduction | ||
693 | movdqa \XMMDst, \TMP2 | ||
694 | # make 3 copies of XMMDst for doing 3 shift operations | ||
695 | movdqa \XMMDst, \TMP3 | ||
696 | movdqa \XMMDst, \TMP4 | ||
697 | psrld $1, \TMP2 # packed right shift >> 1 | ||
698 | psrld $2, \TMP3 # packed right shift >> 2 | ||
699 | psrld $7, \TMP4 # packed right shift >> 7 | ||
700 | pxor \TMP3, \TMP2 # xor the shifted versions | ||
701 | pxor \TMP4, \TMP2 | ||
702 | pxor \TMP7, \TMP2 | ||
703 | pxor \TMP2, \XMMDst | ||
704 | pxor \TMP6, \XMMDst # reduced result is in XMMDst | ||
705 | .endm | ||
706 | |||
707 | /* Encryption of a single block done*/ | ||
708 | .macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 | ||
709 | |||
710 | pxor (%arg1), \XMM0 | ||
711 | movaps 16(%arg1), \TMP1 | ||
712 | AESENC \TMP1, \XMM0 | ||
713 | movaps 32(%arg1), \TMP1 | ||
714 | AESENC \TMP1, \XMM0 | ||
715 | movaps 48(%arg1), \TMP1 | ||
716 | AESENC \TMP1, \XMM0 | ||
717 | movaps 64(%arg1), \TMP1 | ||
718 | AESENC \TMP1, \XMM0 | ||
719 | movaps 80(%arg1), \TMP1 | ||
720 | AESENC \TMP1, \XMM0 | ||
721 | movaps 96(%arg1), \TMP1 | ||
722 | AESENC \TMP1, \XMM0 | ||
723 | movaps 112(%arg1), \TMP1 | ||
724 | AESENC \TMP1, \XMM0 | ||
725 | movaps 128(%arg1), \TMP1 | ||
726 | AESENC \TMP1, \XMM0 | ||
727 | movaps 144(%arg1), \TMP1 | ||
728 | AESENC \TMP1, \XMM0 | ||
729 | movaps 160(%arg1), \TMP1 | ||
730 | AESENCLAST \TMP1, \XMM0 | ||
731 | .endm | ||
732 | |||
733 | |||
734 | /***************************************************************************** | ||
735 | * void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | ||
736 | * u8 *out, // Plaintext output. Encrypt in-place is allowed. | ||
737 | * const u8 *in, // Ciphertext input | ||
738 | * u64 plaintext_len, // Length of data in bytes for decryption. | ||
739 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | ||
740 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | ||
741 | * // concatenated with 0x00000001. 16-byte aligned pointer. | ||
742 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | ||
743 | * const u8 *aad, // Additional Authentication Data (AAD) | ||
744 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | ||
745 | * u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the | ||
746 | * // given authentication tag and only return the plaintext if they match. | ||
747 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 | ||
748 | * // (most likely), 12 or 8. | ||
749 | * | ||
750 | * Assumptions: | ||
751 | * | ||
752 | * keys: | ||
753 | * keys are pre-expanded and aligned to 16 bytes. we are using the first | ||
754 | * set of 11 keys in the data structure void *aes_ctx | ||
755 | * | ||
756 | * iv: | ||
757 | * 0 1 2 3 | ||
758 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
759 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
760 | * | Salt (From the SA) | | ||
761 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
762 | * | Initialization Vector | | ||
763 | * | (This is the sequence number from IPSec header) | | ||
764 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
765 | * | 0x1 | | ||
766 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
767 | * | ||
768 | * | ||
769 | * | ||
770 | * AAD: | ||
771 | * AAD padded to 128 bits with 0 | ||
772 | * for example, assume AAD is a u32 vector | ||
773 | * | ||
774 | * if AAD is 8 bytes: | ||
775 | * AAD[3] = {A0, A1}; | ||
776 | * padded AAD in xmm register = {A1 A0 0 0} | ||
777 | * | ||
778 | * 0 1 2 3 | ||
779 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
780 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
781 | * | SPI (A1) | | ||
782 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
783 | * | 32-bit Sequence Number (A0) | | ||
784 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
785 | * | 0x0 | | ||
786 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
787 | * | ||
788 | * AAD Format with 32-bit Sequence Number | ||
789 | * | ||
790 | * if AAD is 12 bytes: | ||
791 | * AAD[3] = {A0, A1, A2}; | ||
792 | * padded AAD in xmm register = {A2 A1 A0 0} | ||
793 | * | ||
794 | * 0 1 2 3 | ||
795 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
796 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
797 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
798 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
799 | * | SPI (A2) | | ||
800 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
801 | * | 64-bit Extended Sequence Number {A1,A0} | | ||
802 | * | | | ||
803 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
804 | * | 0x0 | | ||
805 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
806 | * | ||
807 | * AAD Format with 64-bit Extended Sequence Number | ||
808 | * | ||
809 | * aadLen: | ||
810 | * from the definition of the spec, aadLen can only be 8 or 12 bytes. | ||
811 | * The code supports 16 too but for other sizes, the code will fail. | ||
812 | * | ||
813 | * TLen: | ||
814 | * from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | ||
815 | * For other sizes, the code will fail. | ||
816 | * | ||
817 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | ||
818 | * | ||
819 | *****************************************************************************/ | ||
820 | |||
821 | ENTRY(aesni_gcm_dec) | ||
822 | push %r12 | ||
823 | push %r13 | ||
824 | push %r14 | ||
825 | mov %rsp, %r14 | ||
826 | /* | ||
827 | * states of %xmm registers %xmm6:%xmm15 not saved | ||
828 | * all %xmm registers are clobbered | ||
829 | */ | ||
830 | sub $VARIABLE_OFFSET, %rsp | ||
831 | and $~63, %rsp # align rsp to 64 bytes | ||
832 | mov %arg6, %r12 | ||
833 | movdqu (%r12), %xmm13 # %xmm13 = HashKey | ||
834 | pshufb SHUF_MASK(%rip), %xmm13 | ||
835 | |||
836 | # Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) | ||
837 | |||
838 | movdqa %xmm13, %xmm2 | ||
839 | psllq $1, %xmm13 | ||
840 | psrlq $63, %xmm2 | ||
841 | movdqa %xmm2, %xmm1 | ||
842 | pslldq $8, %xmm2 | ||
843 | psrldq $8, %xmm1 | ||
844 | por %xmm2, %xmm13 | ||
845 | |||
846 | # Reduction | ||
847 | |||
848 | pshufd $0x24, %xmm1, %xmm2 | ||
849 | pcmpeqd TWOONE(%rip), %xmm2 | ||
850 | pand POLY(%rip), %xmm2 | ||
851 | pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) | ||
852 | |||
853 | |||
854 | # Decrypt first few blocks | ||
855 | |||
856 | movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) | ||
857 | mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext | ||
858 | and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) | ||
859 | mov %r13, %r12 | ||
860 | and $(3<<4), %r12 | ||
861 | jz _initial_num_blocks_is_0_decrypt | ||
862 | cmp $(2<<4), %r12 | ||
863 | jb _initial_num_blocks_is_1_decrypt | ||
864 | je _initial_num_blocks_is_2_decrypt | ||
865 | _initial_num_blocks_is_3_decrypt: | ||
866 | INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
867 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec | ||
868 | sub $48, %r13 | ||
869 | jmp _initial_blocks_decrypted | ||
870 | _initial_num_blocks_is_2_decrypt: | ||
871 | INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
872 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec | ||
873 | sub $32, %r13 | ||
874 | jmp _initial_blocks_decrypted | ||
875 | _initial_num_blocks_is_1_decrypt: | ||
876 | INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
877 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec | ||
878 | sub $16, %r13 | ||
879 | jmp _initial_blocks_decrypted | ||
880 | _initial_num_blocks_is_0_decrypt: | ||
881 | INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
882 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec | ||
883 | _initial_blocks_decrypted: | ||
884 | cmp $0, %r13 | ||
885 | je _zero_cipher_left_decrypt | ||
886 | sub $64, %r13 | ||
887 | je _four_cipher_left_decrypt | ||
888 | _decrypt_by_4: | ||
889 | GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | ||
890 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec | ||
891 | add $64, %r11 | ||
892 | sub $64, %r13 | ||
893 | jne _decrypt_by_4 | ||
894 | _four_cipher_left_decrypt: | ||
895 | GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | ||
896 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | ||
897 | _zero_cipher_left_decrypt: | ||
898 | mov %arg4, %r13 | ||
899 | and $15, %r13 # %r13 = arg4 (mod 16) | ||
900 | je _multiple_of_16_bytes_decrypt | ||
901 | |||
902 | # Handle the last <16 byte block separately | ||
903 | |||
904 | paddd ONE(%rip), %xmm0 # increment CNT to get Yn | ||
905 | pshufb SHUF_MASK(%rip), %xmm0 | ||
906 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) | ||
907 | sub $16, %r11 | ||
908 | add %r13, %r11 | ||
909 | movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block | ||
910 | lea SHIFT_MASK+16(%rip), %r12 | ||
911 | sub %r13, %r12 | ||
912 | # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes | ||
913 | # (%r13 is the number of bytes in plaintext mod 16) | ||
914 | movdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
915 | pshufb %xmm2, %xmm1 # right shift 16-%r13 bytes | ||
916 | movdqa %xmm1, %xmm2 | ||
917 | pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) | ||
918 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | ||
919 | # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 | ||
920 | pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 | ||
921 | pand %xmm1, %xmm2 | ||
922 | pshufb SHUF_MASK(%rip),%xmm2 | ||
923 | pxor %xmm2, %xmm8 | ||
924 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
925 | # GHASH computation for the last <16 byte block | ||
926 | sub %r13, %r11 | ||
927 | add $16, %r11 | ||
928 | |||
929 | # output %r13 bytes | ||
930 | movq %xmm0, %rax | ||
931 | cmp $8, %r13 | ||
932 | jle _less_than_8_bytes_left_decrypt | ||
933 | mov %rax, (%arg2 , %r11, 1) | ||
934 | add $8, %r11 | ||
935 | psrldq $8, %xmm0 | ||
936 | movq %xmm0, %rax | ||
937 | sub $8, %r13 | ||
938 | _less_than_8_bytes_left_decrypt: | ||
939 | mov %al, (%arg2, %r11, 1) | ||
940 | add $1, %r11 | ||
941 | shr $8, %rax | ||
942 | sub $1, %r13 | ||
943 | jne _less_than_8_bytes_left_decrypt | ||
944 | _multiple_of_16_bytes_decrypt: | ||
945 | mov arg8, %r12 # %r12 = aadLen (number of bytes) | ||
946 | shl $3, %r12 # convert into number of bits | ||
947 | movd %r12d, %xmm15 # len(A) in %xmm15 | ||
948 | shl $3, %arg4 # len(C) in bits (*8) | ||
949 | movq %arg4, %xmm1 | ||
950 | pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 | ||
951 | pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) | ||
952 | pxor %xmm15, %xmm8 | ||
953 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
954 | # final GHASH computation | ||
955 | pshufb SHUF_MASK(%rip), %xmm8 | ||
956 | mov %arg5, %rax # %rax = *Y0 | ||
957 | movdqu (%rax), %xmm0 # %xmm0 = Y0 | ||
958 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) | ||
959 | pxor %xmm8, %xmm0 | ||
960 | _return_T_decrypt: | ||
961 | mov arg9, %r10 # %r10 = authTag | ||
962 | mov arg10, %r11 # %r11 = auth_tag_len | ||
963 | cmp $16, %r11 | ||
964 | je _T_16_decrypt | ||
965 | cmp $12, %r11 | ||
966 | je _T_12_decrypt | ||
967 | _T_8_decrypt: | ||
968 | movq %xmm0, %rax | ||
969 | mov %rax, (%r10) | ||
970 | jmp _return_T_done_decrypt | ||
971 | _T_12_decrypt: | ||
972 | movq %xmm0, %rax | ||
973 | mov %rax, (%r10) | ||
974 | psrldq $8, %xmm0 | ||
975 | movd %xmm0, %eax | ||
976 | mov %eax, 8(%r10) | ||
977 | jmp _return_T_done_decrypt | ||
978 | _T_16_decrypt: | ||
979 | movdqu %xmm0, (%r10) | ||
980 | _return_T_done_decrypt: | ||
981 | mov %r14, %rsp | ||
982 | pop %r14 | ||
983 | pop %r13 | ||
984 | pop %r12 | ||
985 | ret | ||
986 | |||
987 | |||
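The pre-counter block j0 described in the header comment above aesni_gcm_dec is just the 4-byte salt from the security association, the 8-byte explicit IV from the ESP payload, and a big-endian 32-bit value of 1. A hypothetical helper showing that layout (the actual construction happens in the C glue code, outside this excerpt):

    #include <linux/string.h>
    #include <linux/types.h>

    /* Assemble j0 = salt || explicit IV || 0x00000001 (big endian). */
    static void rfc4106_build_j0(u8 j0[16], const u8 salt[4], const u8 explicit_iv[8])
    {
            memcpy(j0, salt, 4);
            memcpy(j0 + 4, explicit_iv, 8);
            j0[12] = 0;
            j0[13] = 0;
            j0[14] = 0;
            j0[15] = 1;
    }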
988 | /***************************************************************************** | ||
989 | * void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. | ||
990 | * u8 *out, // Ciphertext output. Encrypt in-place is allowed. | ||
991 | * const u8 *in, // Plaintext input | ||
992 | * u64 plaintext_len, // Length of data in bytes for encryption. | ||
993 | * u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) | ||
994 | * // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) | ||
995 | * // concatenated with 0x00000001. 16-byte aligned pointer. | ||
996 | * u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. | ||
997 | * const u8 *aad, // Additional Authentication Data (AAD) | ||
998 | * u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes | ||
999 | * u8 *auth_tag, // Authenticated Tag output. | ||
1000 | * u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), | ||
1001 | * // 12 or 8. | ||
1002 | * | ||
1003 | * Assumptions: | ||
1004 | * | ||
1005 | * keys: | ||
1006 | * keys are pre-expanded and aligned to 16 bytes. we are using the | ||
1007 | * first set of 11 keys in the data structure void *aes_ctx | ||
1008 | * | ||
1009 | * | ||
1010 | * iv: | ||
1011 | * 0 1 2 3 | ||
1012 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1013 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1014 | * | Salt (From the SA) | | ||
1015 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1016 | * | Initialization Vector | | ||
1017 | * | (This is the sequence number from IPSec header) | | ||
1018 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1019 | * | 0x1 | | ||
1020 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1021 | * | ||
1022 | * | ||
1023 | * | ||
1024 | * AAD: | ||
1025 | * AAD padded to 128 bits with 0 | ||
1026 | * for example, assume AAD is a u32 vector | ||
1027 | * | ||
1028 | * if AAD is 8 bytes: | ||
1029 | * AAD[3] = {A0, A1}; | ||
1030 | * padded AAD in xmm register = {A1 A0 0 0} | ||
1031 | * | ||
1032 | * 0 1 2 3 | ||
1033 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1034 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1035 | * | SPI (A1) | | ||
1036 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1037 | * | 32-bit Sequence Number (A0) | | ||
1038 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1039 | * | 0x0 | | ||
1040 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1041 | * | ||
1042 | * AAD Format with 32-bit Sequence Number | ||
1043 | * | ||
1044 | * if AAD is 12 bytes: | ||
1045 | * AAD[3] = {A0, A1, A2}; | ||
1046 | * padded AAD in xmm register = {A2 A1 A0 0} | ||
1047 | * | ||
1048 | * 0 1 2 3 | ||
1049 | * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 | ||
1050 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1051 | * | SPI (A2) | | ||
1052 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1053 | * | 64-bit Extended Sequence Number {A1,A0} | | ||
1054 | * | | | ||
1055 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1056 | * | 0x0 | | ||
1057 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | ||
1058 | * | ||
1059 | * AAD Format with 64-bit Extended Sequence Number | ||
1060 | * | ||
1061 | * aadLen: | ||
1062 | * from the definition of the spec, aadLen can only be 8 or 12 bytes. | ||
1063 | * The code supports 16 too but for other sizes, the code will fail. | ||
1064 | * | ||
1065 | * TLen: | ||
1066 | * from the definition of the spec, TLen can only be 8, 12 or 16 bytes. | ||
1067 | * For other sizes, the code will fail. | ||
1068 | * | ||
1069 | * poly = x^128 + x^127 + x^126 + x^121 + 1 | ||
1070 | ***************************************************************************/ | ||
1071 | ENTRY(aesni_gcm_enc) | ||
1072 | push %r12 | ||
1073 | push %r13 | ||
1074 | push %r14 | ||
1075 | mov %rsp, %r14 | ||
1076 | # | ||
1077 | # states of %xmm registers %xmm6:%xmm15 not saved | ||
1078 | # all %xmm registers are clobbered | ||
1079 | # | ||
1080 | sub $VARIABLE_OFFSET, %rsp | ||
1081 | and $~63, %rsp | ||
1082 | mov %arg6, %r12 | ||
1083 | movdqu (%r12), %xmm13 | ||
1084 | pshufb SHUF_MASK(%rip), %xmm13 | ||
1085 | |||
1086 | # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) | ||
1087 | |||
1088 | movdqa %xmm13, %xmm2 | ||
1089 | psllq $1, %xmm13 | ||
1090 | psrlq $63, %xmm2 | ||
1091 | movdqa %xmm2, %xmm1 | ||
1092 | pslldq $8, %xmm2 | ||
1093 | psrldq $8, %xmm1 | ||
1094 | por %xmm2, %xmm13 | ||
1095 | |||
1096 | # reduce HashKey<<1 | ||
1097 | |||
1098 | pshufd $0x24, %xmm1, %xmm2 | ||
1099 | pcmpeqd TWOONE(%rip), %xmm2 | ||
1100 | pand POLY(%rip), %xmm2 | ||
1101 | pxor %xmm2, %xmm13 | ||
1102 | movdqa %xmm13, HashKey(%rsp) | ||
1103 | mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) | ||
1104 | and $-16, %r13 | ||
1105 | mov %r13, %r12 | ||
1106 | |||
1107 | # Encrypt first few blocks | ||
1108 | |||
1109 | and $(3<<4), %r12 | ||
1110 | jz _initial_num_blocks_is_0_encrypt | ||
1111 | cmp $(2<<4), %r12 | ||
1112 | jb _initial_num_blocks_is_1_encrypt | ||
1113 | je _initial_num_blocks_is_2_encrypt | ||
1114 | _initial_num_blocks_is_3_encrypt: | ||
1115 | INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1116 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc | ||
1117 | sub $48, %r13 | ||
1118 | jmp _initial_blocks_encrypted | ||
1119 | _initial_num_blocks_is_2_encrypt: | ||
1120 | INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1121 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc | ||
1122 | sub $32, %r13 | ||
1123 | jmp _initial_blocks_encrypted | ||
1124 | _initial_num_blocks_is_1_encrypt: | ||
1125 | INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1126 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc | ||
1127 | sub $16, %r13 | ||
1128 | jmp _initial_blocks_encrypted | ||
1129 | _initial_num_blocks_is_0_encrypt: | ||
1130 | INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ | ||
1131 | %xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc | ||
1132 | _initial_blocks_encrypted: | ||
1133 | |||
1134 | # Main loop - Encrypt remaining blocks | ||
1135 | |||
1136 | cmp $0, %r13 | ||
1137 | je _zero_cipher_left_encrypt | ||
1138 | sub $64, %r13 | ||
1139 | je _four_cipher_left_encrypt | ||
1140 | _encrypt_by_4_encrypt: | ||
1141 | GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ | ||
1142 | %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc | ||
1143 | add $64, %r11 | ||
1144 | sub $64, %r13 | ||
1145 | jne _encrypt_by_4_encrypt | ||
1146 | _four_cipher_left_encrypt: | ||
1147 | GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ | ||
1148 | %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 | ||
1149 | _zero_cipher_left_encrypt: | ||
1150 | mov %arg4, %r13 | ||
1151 | and $15, %r13 # %r13 = arg4 (mod 16) | ||
1152 | je _multiple_of_16_bytes_encrypt | ||
1153 | |||
1154 | # Handle the last <16 Byte block separately | ||
1155 | paddd ONE(%rip), %xmm0 # INCR CNT to get Yn | ||
1156 | pshufb SHUF_MASK(%rip), %xmm0 | ||
1157 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) | ||
1158 | sub $16, %r11 | ||
1159 | add %r13, %r11 | ||
1160 | movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks | ||
1161 | lea SHIFT_MASK+16(%rip), %r12 | ||
1162 | sub %r13, %r12 | ||
1163 | # adjust the shuffle mask pointer to be able to shift 16-r13 bytes | ||
1164 | # (%r13 is the number of bytes in plaintext mod 16) | ||
1165 | movdqu (%r12), %xmm2 # get the appropriate shuffle mask | ||
1166 | pshufb %xmm2, %xmm1 # shift right 16-r13 bytes | ||
1167 | pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) | ||
1168 | movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 | ||
1169 | # get the appropriate mask to mask out top 16-r13 bytes of xmm0 | ||
1170 | pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 | ||
1171 | |||
1172 | pshufb SHUF_MASK(%rip),%xmm0 | ||
1173 | pxor %xmm0, %xmm8 | ||
1174 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1175 | # GHASH computation for the last <16 byte block | ||
1176 | sub %r13, %r11 | ||
1177 | add $16, %r11 | ||
1178 | pshufb SHUF_MASK(%rip), %xmm0 | ||
1179 | # shuffle xmm0 back to output as ciphertext | ||
1180 | |||
1181 | # Output %r13 bytes | ||
1182 | movq %xmm0, %rax | ||
1183 | cmp $8, %r13 | ||
1184 | jle _less_than_8_bytes_left_encrypt | ||
1185 | mov %rax, (%arg2 , %r11, 1) | ||
1186 | add $8, %r11 | ||
1187 | psrldq $8, %xmm0 | ||
1188 | movq %xmm0, %rax | ||
1189 | sub $8, %r13 | ||
1190 | _less_than_8_bytes_left_encrypt: | ||
1191 | mov %al, (%arg2, %r11, 1) | ||
1192 | add $1, %r11 | ||
1193 | shr $8, %rax | ||
1194 | sub $1, %r13 | ||
1195 | jne _less_than_8_bytes_left_encrypt | ||
1196 | _multiple_of_16_bytes_encrypt: | ||
1197 | mov arg8, %r12 # %r12 = aadLen (number of bytes) | ||
1198 | shl $3, %r12 | ||
1199 | movd %r12d, %xmm15 # len(A) in %xmm15 | ||
1200 | shl $3, %arg4 # len(C) in bits (*8) | ||
1201 | movq %arg4, %xmm1 | ||
1202 | pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 | ||
1203 | pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) | ||
1204 | pxor %xmm15, %xmm8 | ||
1205 | GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 | ||
1206 | # final GHASH computation | ||
1207 | |||
1208 | pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap | ||
1209 | mov %arg5, %rax # %rax = *Y0 | ||
1210 | movdqu (%rax), %xmm0 # %xmm0 = Y0 | ||
1211 | ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) | ||
1212 | pxor %xmm8, %xmm0 | ||
1213 | _return_T_encrypt: | ||
1214 | mov arg9, %r10 # %r10 = authTag | ||
1215 | mov arg10, %r11 # %r11 = auth_tag_len | ||
1216 | cmp $16, %r11 | ||
1217 | je _T_16_encrypt | ||
1218 | cmp $12, %r11 | ||
1219 | je _T_12_encrypt | ||
1220 | _T_8_encrypt: | ||
1221 | movq %xmm0, %rax | ||
1222 | mov %rax, (%r10) | ||
1223 | jmp _return_T_done_encrypt | ||
1224 | _T_12_encrypt: | ||
1225 | movq %xmm0, %rax | ||
1226 | mov %rax, (%r10) | ||
1227 | psrldq $8, %xmm0 | ||
1228 | movd %xmm0, %eax | ||
1229 | mov %eax, 8(%r10) | ||
1230 | jmp _return_T_done_encrypt | ||
1231 | _T_16_encrypt: | ||
1232 | movdqu %xmm0, (%r10) | ||
1233 | _return_T_done_encrypt: | ||
1234 | mov %r14, %rsp | ||
1235 | pop %r14 | ||
1236 | pop %r13 | ||
1237 | pop %r12 | ||
1238 | ret | ||
1239 | |||
1240 | |||
1241 | |||
50 | _key_expansion_128: | 1242 | _key_expansion_128: |
51 | _key_expansion_256a: | 1243 | _key_expansion_256a: |
52 | pshufd $0b11111111, %xmm1, %xmm1 | 1244 | pshufd $0b11111111, %xmm1, %xmm1 |
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..02d349d64423 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@ | |||
5 | * Copyright (C) 2008, Intel Corp. | 5 | * Copyright (C) 2008, Intel Corp. |
6 | * Author: Huang Ying <ying.huang@intel.com> | 6 | * Author: Huang Ying <ying.huang@intel.com> |
7 | * | 7 | * |
8 | * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD | ||
9 | * interface for 64-bit kernels. | ||
10 | * Authors: Adrian Hoban <adrian.hoban@intel.com> | ||
11 | * Gabriele Paoloni <gabriele.paoloni@intel.com> | ||
12 | * Tadeusz Struk (tadeusz.struk@intel.com) | ||
13 | * Aidan O'Mahony (aidan.o.mahony@intel.com) | ||
14 | * Copyright (c) 2010, Intel Corporation. | ||
15 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | 16 | * This program is free software; you can redistribute it and/or modify |
9 | * it under the terms of the GNU General Public License as published by | 17 | * it under the terms of the GNU General Public License as published by |
10 | * the Free Software Foundation; either version 2 of the License, or | 18 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +29,10 @@ | |||
21 | #include <crypto/ctr.h> | 29 | #include <crypto/ctr.h> |
22 | #include <asm/i387.h> | 30 | #include <asm/i387.h> |
23 | #include <asm/aes.h> | 31 | #include <asm/aes.h> |
32 | #include <crypto/scatterwalk.h> | ||
33 | #include <crypto/internal/aead.h> | ||
34 | #include <linux/workqueue.h> | ||
35 | #include <linux/spinlock.h> | ||
24 | 36 | ||
25 | #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) | 37 | #if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) |
26 | #define HAS_CTR | 38 | #define HAS_CTR |
@@ -42,8 +54,31 @@ struct async_aes_ctx { | |||
42 | struct cryptd_ablkcipher *cryptd_tfm; | 54 | struct cryptd_ablkcipher *cryptd_tfm; |
43 | }; | 55 | }; |
44 | 56 | ||
45 | #define AESNI_ALIGN 16 | 57 | /* This data is stored at the end of the crypto_tfm struct. |
58 | * It's a type of per "session" data storage location. | ||
59 | * This needs to be 16 byte aligned. | ||
60 | */ | ||
61 | struct aesni_rfc4106_gcm_ctx { | ||
62 | u8 hash_subkey[16]; | ||
63 | struct crypto_aes_ctx aes_key_expanded; | ||
64 | u8 nonce[4]; | ||
65 | struct cryptd_aead *cryptd_tfm; | ||
66 | }; | ||
67 | |||
68 | struct aesni_gcm_set_hash_subkey_result { | ||
69 | int err; | ||
70 | struct completion completion; | ||
71 | }; | ||
72 | |||
73 | struct aesni_hash_subkey_req_data { | ||
74 | u8 iv[16]; | ||
75 | struct aesni_gcm_set_hash_subkey_result result; | ||
76 | struct scatterlist sg; | ||
77 | }; | ||
78 | |||
79 | #define AESNI_ALIGN (16) | ||
46 | #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) | 80 | #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) |
81 | #define RFC4106_HASH_SUBKEY_SIZE 16 | ||
47 | 82 | ||
48 | asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, | 83 | asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, |
49 | unsigned int key_len); | 84 | unsigned int key_len); |
@@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, | |||
62 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, | 97 | asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, |
63 | const u8 *in, unsigned int len, u8 *iv); | 98 | const u8 *in, unsigned int len, u8 *iv); |
64 | 99 | ||
100 | /* asmlinkage void aesni_gcm_enc() | ||
101 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. | ||
102 | * u8 *out, Ciphertext output. Encrypt in-place is allowed. | ||
103 | * const u8 *in, Plaintext input | ||
104 | * unsigned long plaintext_len, Length of data in bytes for encryption. | ||
105 | * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) | ||
106 | * concatenated with 8 byte Initialisation Vector (from IPSec ESP | ||
107 | * Payload) concatenated with 0x00000001. 16-byte aligned pointer. | ||
108 | * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. | ||
109 | * const u8 *aad, Additional Authentication Data (AAD) | ||
110 | * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this | ||
111 | * is going to be 8 or 12 bytes | ||
112 | * u8 *auth_tag, Authenticated Tag output. | ||
113 | * unsigned long auth_tag_len), Authenticated Tag Length in bytes. | ||
114 | * Valid values are 16 (most likely), 12 or 8. | ||
115 | */ | ||
116 | asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, | ||
117 | const u8 *in, unsigned long plaintext_len, u8 *iv, | ||
118 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
119 | u8 *auth_tag, unsigned long auth_tag_len); | ||
120 | |||
121 | /* asmlinkage void aesni_gcm_dec() | ||
122 | * void *ctx, AES Key schedule. Starts on a 16 byte boundary. | ||
123 | * u8 *out, Plaintext output. Decrypt in-place is allowed. | ||
124 | * const u8 *in, Ciphertext input | ||
125 | * unsigned long ciphertext_len, Length of data in bytes for decryption. | ||
126 | * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) | ||
127 | * concatenated with 8 byte Initialisation Vector (from IPSec ESP | ||
128 | * Payload) concatenated with 0x00000001. 16-byte aligned pointer. | ||
129 | * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. | ||
130 | * const u8 *aad, Additional Authentication Data (AAD) | ||
131 | * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going | ||
132 | * to be 8 or 12 bytes | ||
133 | * u8 *auth_tag, Authenticated Tag output. | ||
134 | * unsigned long auth_tag_len) Authenticated Tag Length in bytes. | ||
135 | * Valid values are 16 (most likely), 12 or 8. | ||
136 | */ | ||
137 | asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, | ||
138 | const u8 *in, unsigned long ciphertext_len, u8 *iv, | ||
139 | u8 *hash_subkey, const u8 *aad, unsigned long aad_len, | ||
140 | u8 *auth_tag, unsigned long auth_tag_len); | ||
141 | |||
142 | static inline struct | ||
143 | aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) | ||
144 | { | ||
145 | return | ||
146 | (struct aesni_rfc4106_gcm_ctx *) | ||
147 | PTR_ALIGN((u8 *) | ||
148 | crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); | ||
149 | } | ||
150 | |||
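Note how the helper above compensates for alignment: the crypto core only guarantees cra_alignmask alignment for the tfm context (0 for these algorithms), so cra_ctxsize in the algorithm definitions below reserves sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN bytes and the pointer is rounded up at run time. A sketch with illustrative addresses:

    /* If crypto_tfm_ctx() returns e.g. 0x...1008 (only 8-byte aligned),
     * PTR_ALIGN(..., AESNI_ALIGN) yields 0x...1010; the extra AESNI_ALIGN
     * bytes in cra_ctxsize guarantee the rounded-up pointer still has room
     * for the whole struct, so hash_subkey and aes_key_expanded start on
     * the 16-byte boundaries the assembly routines require. */
    u8 *raw = crypto_tfm_ctx(crypto_aead_tfm(tfm));
    struct aesni_rfc4106_gcm_ctx *ctx =
            (struct aesni_rfc4106_gcm_ctx *)PTR_ALIGN(raw, AESNI_ALIGN);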
65 | static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) | 151 | static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) |
66 | { | 152 | { |
67 | unsigned long addr = (unsigned long)raw_ctx; | 153 | unsigned long addr = (unsigned long)raw_ctx; |
@@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = { | |||
730 | }; | 816 | }; |
731 | #endif | 817 | #endif |
732 | 818 | ||
819 | static int rfc4106_init(struct crypto_tfm *tfm) | ||
820 | { | ||
821 | struct cryptd_aead *cryptd_tfm; | ||
822 | struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) | ||
823 | PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); | ||
824 | cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); | ||
825 | if (IS_ERR(cryptd_tfm)) | ||
826 | return PTR_ERR(cryptd_tfm); | ||
827 | ctx->cryptd_tfm = cryptd_tfm; | ||
828 | tfm->crt_aead.reqsize = sizeof(struct aead_request) | ||
829 | + crypto_aead_reqsize(&cryptd_tfm->base); | ||
830 | return 0; | ||
831 | } | ||
832 | |||
833 | static void rfc4106_exit(struct crypto_tfm *tfm) | ||
834 | { | ||
835 | struct aesni_rfc4106_gcm_ctx *ctx = | ||
836 | (struct aesni_rfc4106_gcm_ctx *) | ||
837 | PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); | ||
838 | if (!IS_ERR(ctx->cryptd_tfm)) | ||
839 | cryptd_free_aead(ctx->cryptd_tfm); | ||
840 | return; | ||
841 | } | ||
842 | |||
843 | static void | ||
844 | rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err) | ||
845 | { | ||
846 | struct aesni_gcm_set_hash_subkey_result *result = req->data; | ||
847 | |||
848 | if (err == -EINPROGRESS) | ||
849 | return; | ||
850 | result->err = err; | ||
851 | complete(&result->completion); | ||
852 | } | ||
853 | |||
854 | static int | ||
855 | rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) | ||
856 | { | ||
857 | struct crypto_ablkcipher *ctr_tfm; | ||
858 | struct ablkcipher_request *req; | ||
859 | int ret = -EINVAL; | ||
860 | struct aesni_hash_subkey_req_data *req_data; | ||
861 | |||
862 | ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0); | ||
863 | if (IS_ERR(ctr_tfm)) | ||
864 | return PTR_ERR(ctr_tfm); | ||
865 | |||
866 | crypto_ablkcipher_clear_flags(ctr_tfm, ~0); | ||
867 | |||
868 | ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); | ||
869 | if (ret) { | ||
870 | crypto_free_ablkcipher(ctr_tfm); | ||
871 | return ret; | ||
872 | } | ||
873 | |||
874 | req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); | ||
875 | if (!req) { | ||
876 | crypto_free_ablkcipher(ctr_tfm); | ||
877 | return -EINVAL; | ||
878 | } | ||
879 | |||
880 | req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); | ||
881 | if (!req_data) { | ||
882 | crypto_free_ablkcipher(ctr_tfm); | ||
883 | return -ENOMEM; | ||
884 | } | ||
885 | memset(req_data->iv, 0, sizeof(req_data->iv)); | ||
886 | |||
887 | /* Clear the data in the hash sub key container to zero.*/ | ||
888 | /* We want to cipher all zeros to create the hash sub key. */ | ||
889 | memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); | ||
890 | |||
891 | init_completion(&req_data->result.completion); | ||
892 | sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE); | ||
893 | ablkcipher_request_set_tfm(req, ctr_tfm); | ||
894 | ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | | ||
895 | CRYPTO_TFM_REQ_MAY_BACKLOG, | ||
896 | rfc4106_set_hash_subkey_done, | ||
897 | &req_data->result); | ||
898 | |||
899 | ablkcipher_request_set_crypt(req, &req_data->sg, | ||
900 | &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv); | ||
901 | |||
902 | ret = crypto_ablkcipher_encrypt(req); | ||
903 | if (ret == -EINPROGRESS || ret == -EBUSY) { | ||
904 | ret = wait_for_completion_interruptible | ||
905 | (&req_data->result.completion); | ||
906 | if (!ret) | ||
907 | ret = req_data->result.err; | ||
908 | } | ||
909 | ablkcipher_request_free(req); | ||
910 | kfree(req_data); | ||
911 | crypto_free_ablkcipher(ctr_tfm); | ||
912 | return ret; | ||
913 | } | ||
914 | |||
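In GCM terms, the routine above derives the hash subkey H = AES_K(0^128): CTR mode with an all-zero counter block over an all-zero plaintext block reduces to a single AES encryption of a zero block. A minimal equivalent sketch using the synchronous single-block cipher interface (assumed usable here; error handling abbreviated):

    #include <linux/crypto.h>
    #include <linux/err.h>
    #include <linux/string.h>

    /* Sketch: derive H by encrypting one all-zero 16-byte block with AES. */
    static int derive_hash_subkey(u8 *hash_subkey, const u8 *key,
                                  unsigned int key_len)
    {
            struct crypto_cipher *aes;
            int ret;

            aes = crypto_alloc_cipher("aes", 0, 0);
            if (IS_ERR(aes))
                    return PTR_ERR(aes);
            ret = crypto_cipher_setkey(aes, key, key_len);
            if (!ret) {
                    memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
                    crypto_cipher_encrypt_one(aes, hash_subkey, hash_subkey);
            }
            crypto_free_cipher(aes);
            return ret;
    }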
915 | static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, | ||
916 | unsigned int key_len) | ||
917 | { | ||
918 | int ret = 0; | ||
919 | struct crypto_tfm *tfm = crypto_aead_tfm(parent); | ||
920 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); | ||
921 | u8 *new_key_mem = NULL; | ||
922 | |||
923 | if (key_len < 4) { | ||
924 | crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
925 | return -EINVAL; | ||
926 | } | ||
927 | /*Account for 4 byte nonce at the end.*/ | ||
928 | key_len -= 4; | ||
929 | if (key_len != AES_KEYSIZE_128) { | ||
930 | crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); | ||
931 | return -EINVAL; | ||
932 | } | ||
933 | |||
934 | memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); | ||
935 | /*This must be on a 16 byte boundary!*/ | ||
936 | if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN) | ||
937 | return -EINVAL; | ||
938 | |||
939 | if ((unsigned long)key % AESNI_ALIGN) { | ||
940 | /*key is not aligned: use an auxiliary aligned pointer*/ | ||
941 | new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL); | ||
942 | if (!new_key_mem) | ||
943 | return -ENOMEM; | ||
944 | |||
945 | new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN); | ||
946 | memcpy(new_key_mem, key, key_len); | ||
947 | key = new_key_mem; | ||
948 | } | ||
949 | |||
950 | if (!irq_fpu_usable()) | ||
951 | ret = crypto_aes_expand_key(&(ctx->aes_key_expanded), | ||
952 | key, key_len); | ||
953 | else { | ||
954 | kernel_fpu_begin(); | ||
955 | ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len); | ||
956 | kernel_fpu_end(); | ||
957 | } | ||
958 | /*This must be on a 16 byte boundary!*/ | ||
959 | if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) { | ||
960 | ret = -EINVAL; | ||
961 | goto exit; | ||
962 | } | ||
963 | ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); | ||
964 | exit: | ||
965 | kfree(new_key_mem); | ||
966 | return ret; | ||
967 | } | ||
968 | |||
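As the setkey above shows, the key material handed to rfc4106(gcm(aes)) is the AES-128 key with the 4-byte salt appended (the salt lands in ctx->nonce), so the total key_len is 20. A caller-side sketch with hypothetical key bytes, assuming tfm and err are declared by the caller:

    /* Hypothetical 20-byte key blob: 16-byte AES key followed by 4-byte salt. */
    static const u8 example_rfc4106_key[20] = {
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* AES-128 key */
            0xca, 0xfe, 0xba, 0xbe,                            /* salt        */
    };

    err = crypto_aead_setkey(tfm, example_rfc4106_key,
                             sizeof(example_rfc4106_key));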
969 | /* This is the Integrity Check Value (aka the authentication tag) length; | ||
970 | * it can be 8, 12 or 16 bytes long. */ | ||
971 | static int rfc4106_set_authsize(struct crypto_aead *parent, | ||
972 | unsigned int authsize) | ||
973 | { | ||
974 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); | ||
975 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
976 | |||
977 | switch (authsize) { | ||
978 | case 8: | ||
979 | case 12: | ||
980 | case 16: | ||
981 | break; | ||
982 | default: | ||
983 | return -EINVAL; | ||
984 | } | ||
985 | crypto_aead_crt(parent)->authsize = authsize; | ||
986 | crypto_aead_crt(cryptd_child)->authsize = authsize; | ||
987 | return 0; | ||
988 | } | ||
989 | |||
990 | static int rfc4106_encrypt(struct aead_request *req) | ||
991 | { | ||
992 | int ret; | ||
993 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
994 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
995 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
996 | |||
997 | if (!irq_fpu_usable()) { | ||
998 | struct aead_request *cryptd_req = | ||
999 | (struct aead_request *) aead_request_ctx(req); | ||
1000 | memcpy(cryptd_req, req, sizeof(*req)); | ||
1001 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
1002 | return crypto_aead_encrypt(cryptd_req); | ||
1003 | } else { | ||
1004 | kernel_fpu_begin(); | ||
1005 | ret = cryptd_child->base.crt_aead.encrypt(req); | ||
1006 | kernel_fpu_end(); | ||
1007 | return ret; | ||
1008 | } | ||
1009 | } | ||
1010 | |||
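Both rfc4106_encrypt() above and rfc4106_decrypt() below apply the same dispatch rule, summarized here as a sketch:

    /*
     * if (irq_fpu_usable())   run the inner __driver-gcm-aes-aesni transform
     *                         synchronously, between kernel_fpu_begin() and
     *                         kernel_fpu_end();
     * else                    re-target the request at ctx->cryptd_tfm so that
     *                         cryptd executes it later from process context,
     *                         where the SSE/FPU state may be used safely.
     */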
1011 | static int rfc4106_decrypt(struct aead_request *req) | ||
1012 | { | ||
1013 | int ret; | ||
1014 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1015 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1016 | struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); | ||
1017 | |||
1018 | if (!irq_fpu_usable()) { | ||
1019 | struct aead_request *cryptd_req = | ||
1020 | (struct aead_request *) aead_request_ctx(req); | ||
1021 | memcpy(cryptd_req, req, sizeof(*req)); | ||
1022 | aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); | ||
1023 | return crypto_aead_decrypt(cryptd_req); | ||
1024 | } else { | ||
1025 | kernel_fpu_begin(); | ||
1026 | ret = cryptd_child->base.crt_aead.decrypt(req); | ||
1027 | kernel_fpu_end(); | ||
1028 | return ret; | ||
1029 | } | ||
1030 | } | ||
1031 | |||
1032 | static struct crypto_alg rfc4106_alg = { | ||
1033 | .cra_name = "rfc4106(gcm(aes))", | ||
1034 | .cra_driver_name = "rfc4106-gcm-aesni", | ||
1035 | .cra_priority = 400, | ||
1036 | .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, | ||
1037 | .cra_blocksize = 1, | ||
1038 | .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, | ||
1039 | .cra_alignmask = 0, | ||
1040 | .cra_type = &crypto_nivaead_type, | ||
1041 | .cra_module = THIS_MODULE, | ||
1042 | .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list), | ||
1043 | .cra_init = rfc4106_init, | ||
1044 | .cra_exit = rfc4106_exit, | ||
1045 | .cra_u = { | ||
1046 | .aead = { | ||
1047 | .setkey = rfc4106_set_key, | ||
1048 | .setauthsize = rfc4106_set_authsize, | ||
1049 | .encrypt = rfc4106_encrypt, | ||
1050 | .decrypt = rfc4106_decrypt, | ||
1051 | .geniv = "seqiv", | ||
1052 | .ivsize = 8, | ||
1053 | .maxauthsize = 16, | ||
1054 | }, | ||
1055 | }, | ||
1056 | }; | ||
1057 | |||
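A minimal caller-side sketch for the transform registered above (hypothetical code, error handling shortened, reusing the 20-byte key blob from the setkey sketch; IPsec normally reaches this through the generic AEAD API, but the flow is the same):

    struct crypto_aead *tfm;
    int err;

    tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
    if (IS_ERR(tfm))
            return PTR_ERR(tfm);

    /* 8, 12 or 16 byte ICV, as enforced by rfc4106_set_authsize() */
    err = crypto_aead_setauthsize(tfm, 16);
    if (!err)
            /* 20 bytes: AES-128 key followed by the 4-byte salt */
            err = crypto_aead_setkey(tfm, example_rfc4106_key, 20);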
1058 | static int __driver_rfc4106_encrypt(struct aead_request *req) | ||
1059 | { | ||
1060 | u8 one_entry_in_sg = 0; | ||
1061 | u8 *src, *dst, *assoc; | ||
1062 | __be32 counter = cpu_to_be32(1); | ||
1063 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1064 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1065 | void *aes_ctx = &(ctx->aes_key_expanded); | ||
1066 | unsigned long auth_tag_len = crypto_aead_authsize(tfm); | ||
1067 | u8 iv_tab[16+AESNI_ALIGN]; | ||
1068 | u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN); | ||
1069 | struct scatter_walk src_sg_walk; | ||
1070 | struct scatter_walk assoc_sg_walk; | ||
1071 | struct scatter_walk dst_sg_walk; | ||
1072 | unsigned int i; | ||
1073 | |||
1074 | /* Assuming we are supporting rfc4106 64-bit extended */ | ||
1075 | /* sequence numbers, the AAD length must be equal */ | ||
1076 | /* to 8 or 12 bytes. */ | ||
1077 | if (unlikely(req->assoclen != 8 && req->assoclen != 12)) | ||
1078 | return -EINVAL; | ||
1079 | /* Build the IV (pre-counter block j0) below */ | ||
1080 | for (i = 0; i < 4; i++) | ||
1081 | *(iv+i) = ctx->nonce[i]; | ||
1082 | for (i = 0; i < 8; i++) | ||
1083 | *(iv+4+i) = req->iv[i]; | ||
1084 | *((__be32 *)(iv+12)) = counter; | ||
1085 | |||
1086 | if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { | ||
1087 | one_entry_in_sg = 1; | ||
1088 | scatterwalk_start(&src_sg_walk, req->src); | ||
1089 | scatterwalk_start(&assoc_sg_walk, req->assoc); | ||
1090 | src = scatterwalk_map(&src_sg_walk, 0); | ||
1091 | assoc = scatterwalk_map(&assoc_sg_walk, 0); | ||
1092 | dst = src; | ||
1093 | if (unlikely(req->src != req->dst)) { | ||
1094 | scatterwalk_start(&dst_sg_walk, req->dst); | ||
1095 | dst = scatterwalk_map(&dst_sg_walk, 0); | ||
1096 | } | ||
1097 | |||
1098 | } else { | ||
1099 | /* Allocate memory for src, dst, assoc */ | ||
1100 | src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, | ||
1101 | GFP_ATOMIC); | ||
1102 | if (unlikely(!src)) | ||
1103 | return -ENOMEM; | ||
1104 | assoc = (src + req->cryptlen + auth_tag_len); | ||
1105 | scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); | ||
1106 | scatterwalk_map_and_copy(assoc, req->assoc, 0, | ||
1107 | req->assoclen, 0); | ||
1108 | dst = src; | ||
1109 | } | ||
1110 | |||
1111 | aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, | ||
1112 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst | ||
1113 | + ((unsigned long)req->cryptlen), auth_tag_len); | ||
1114 | |||
1115 | /* The authTag (aka the Integrity Check Value) needs to be written | ||
1116 | * back to the packet. */ | ||
1117 | if (one_entry_in_sg) { | ||
1118 | if (unlikely(req->src != req->dst)) { | ||
1119 | scatterwalk_unmap(dst, 0); | ||
1120 | scatterwalk_done(&dst_sg_walk, 0, 0); | ||
1121 | } | ||
1122 | scatterwalk_unmap(src, 0); | ||
1123 | scatterwalk_unmap(assoc, 0); | ||
1124 | scatterwalk_done(&src_sg_walk, 0, 0); | ||
1125 | scatterwalk_done(&assoc_sg_walk, 0, 0); | ||
1126 | } else { | ||
1127 | scatterwalk_map_and_copy(dst, req->dst, 0, | ||
1128 | req->cryptlen + auth_tag_len, 1); | ||
1129 | kfree(src); | ||
1130 | } | ||
1131 | return 0; | ||
1132 | } | ||
1133 | |||
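For reference, a sketch of the linear buffer produced by the encrypt path above:

    /*
     * dst after aesni_gcm_enc(), as written back to req->dst:
     *
     *   [ ciphertext: req->cryptlen bytes ][ auth tag: auth_tag_len bytes ]
     *
     * so the destination scatterlist must provide room for
     * req->cryptlen + crypto_aead_authsize(tfm) bytes.
     */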
1134 | static int __driver_rfc4106_decrypt(struct aead_request *req) | ||
1135 | { | ||
1136 | u8 one_entry_in_sg = 0; | ||
1137 | u8 *src, *dst, *assoc; | ||
1138 | unsigned long tempCipherLen = 0; | ||
1139 | __be32 counter = cpu_to_be32(1); | ||
1140 | int retval = 0; | ||
1141 | struct crypto_aead *tfm = crypto_aead_reqtfm(req); | ||
1142 | struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); | ||
1143 | void *aes_ctx = &(ctx->aes_key_expanded); | ||
1144 | unsigned long auth_tag_len = crypto_aead_authsize(tfm); | ||
1145 | u8 iv_and_authTag[32+AESNI_ALIGN]; | ||
1146 | u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN); | ||
1147 | u8 *authTag = iv + 16; | ||
1148 | struct scatter_walk src_sg_walk; | ||
1149 | struct scatter_walk assoc_sg_walk; | ||
1150 | struct scatter_walk dst_sg_walk; | ||
1151 | unsigned int i; | ||
1152 | |||
1153 | if (unlikely((req->cryptlen < auth_tag_len) || | ||
1154 | (req->assoclen != 8 && req->assoclen != 12))) | ||
1155 | return -EINVAL; | ||
1156 | /* Assuming we are supporting rfc4106 64-bit extended */ | ||
1157 | /* sequence numbers, the AAD length must be */ | ||
1158 | /* equal to 8 or 12 bytes. */ | ||
1159 | |||
1160 | tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); | ||
1161 | /* Build the IV (pre-counter block j0) below */ | ||
1162 | for (i = 0; i < 4; i++) | ||
1163 | *(iv+i) = ctx->nonce[i]; | ||
1164 | for (i = 0; i < 8; i++) | ||
1165 | *(iv+4+i) = req->iv[i]; | ||
1166 | *((__be32 *)(iv+12)) = counter; | ||
1167 | |||
1168 | if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { | ||
1169 | one_entry_in_sg = 1; | ||
1170 | scatterwalk_start(&src_sg_walk, req->src); | ||
1171 | scatterwalk_start(&assoc_sg_walk, req->assoc); | ||
1172 | src = scatterwalk_map(&src_sg_walk, 0); | ||
1173 | assoc = scatterwalk_map(&assoc_sg_walk, 0); | ||
1174 | dst = src; | ||
1175 | if (unlikely(req->src != req->dst)) { | ||
1176 | scatterwalk_start(&dst_sg_walk, req->dst); | ||
1177 | dst = scatterwalk_map(&dst_sg_walk, 0); | ||
1178 | } | ||
1179 | |||
1180 | } else { | ||
1181 | /* Allocate memory for src, dst, assoc */ | ||
1182 | src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); | ||
1183 | if (!src) | ||
1184 | return -ENOMEM; | ||
1185 | assoc = (src + req->cryptlen); | ||
1186 | scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); | ||
1187 | scatterwalk_map_and_copy(assoc, req->assoc, 0, | ||
1188 | req->assoclen, 0); | ||
1189 | dst = src; | ||
1190 | } | ||
1191 | |||
1192 | aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, | ||
1193 | ctx->hash_subkey, assoc, (unsigned long)req->assoclen, | ||
1194 | authTag, auth_tag_len); | ||
1195 | |||
1196 | /* Compare generated tag with passed in tag. */ | ||
1197 | retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? | ||
1198 | -EBADMSG : 0; | ||
1199 | |||
1200 | if (one_entry_in_sg) { | ||
1201 | if (unlikely(req->src != req->dst)) { | ||
1202 | scatterwalk_unmap(dst, 0); | ||
1203 | scatterwalk_done(&dst_sg_walk, 0, 0); | ||
1204 | } | ||
1205 | scatterwalk_unmap(src, 0); | ||
1206 | scatterwalk_unmap(assoc, 0); | ||
1207 | scatterwalk_done(&src_sg_walk, 0, 0); | ||
1208 | scatterwalk_done(&assoc_sg_walk, 0, 0); | ||
1209 | } else { | ||
1210 | scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1); | ||
1211 | kfree(src); | ||
1212 | } | ||
1213 | return retval; | ||
1214 | } | ||
1215 | |||
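And the corresponding layout consumed by the decrypt path above, where req->cryptlen covers ciphertext plus ICV:

    /*
     * src on entry:
     *
     *   [ ciphertext: req->cryptlen - auth_tag_len bytes ][ ICV: auth_tag_len ]
     *
     * aesni_gcm_dec() writes the computed tag into the stack buffer authTag,
     * which is then compared against the trailing ICV; -EBADMSG on mismatch.
     */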
1216 | static struct crypto_alg __rfc4106_alg = { | ||
1217 | .cra_name = "__gcm-aes-aesni", | ||
1218 | .cra_driver_name = "__driver-gcm-aes-aesni", | ||
1219 | .cra_priority = 0, | ||
1220 | .cra_flags = CRYPTO_ALG_TYPE_AEAD, | ||
1221 | .cra_blocksize = 1, | ||
1222 | .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN, | ||
1223 | .cra_alignmask = 0, | ||
1224 | .cra_type = &crypto_aead_type, | ||
1225 | .cra_module = THIS_MODULE, | ||
1226 | .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list), | ||
1227 | .cra_u = { | ||
1228 | .aead = { | ||
1229 | .encrypt = __driver_rfc4106_encrypt, | ||
1230 | .decrypt = __driver_rfc4106_decrypt, | ||
1231 | }, | ||
1232 | }, | ||
1233 | }; | ||
1234 | |||
733 | static int __init aesni_init(void) | 1235 | static int __init aesni_init(void) |
734 | { | 1236 | { |
735 | int err; | 1237 | int err; |
@@ -738,6 +1240,7 @@ static int __init aesni_init(void) | |||
738 | printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); | 1240 | printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); |
739 | return -ENODEV; | 1241 | return -ENODEV; |
740 | } | 1242 | } |
1243 | |||
741 | if ((err = crypto_register_alg(&aesni_alg))) | 1244 | if ((err = crypto_register_alg(&aesni_alg))) |
742 | goto aes_err; | 1245 | goto aes_err; |
743 | if ((err = crypto_register_alg(&__aesni_alg))) | 1246 | if ((err = crypto_register_alg(&__aesni_alg))) |
@@ -770,10 +1273,19 @@ static int __init aesni_init(void) | |||
770 | if ((err = crypto_register_alg(&ablk_xts_alg))) | 1273 | if ((err = crypto_register_alg(&ablk_xts_alg))) |
771 | goto ablk_xts_err; | 1274 | goto ablk_xts_err; |
772 | #endif | 1275 | #endif |
773 | 1276 | err = crypto_register_alg(&__rfc4106_alg); | |
1277 | if (err) | ||
1278 | goto __aead_gcm_err; | ||
1279 | err = crypto_register_alg(&rfc4106_alg); | ||
1280 | if (err) | ||
1281 | goto aead_gcm_err; | ||
774 | return err; | 1282 | return err; |
775 | 1283 | ||
1284 | aead_gcm_err: | ||
1285 | crypto_unregister_alg(&__rfc4106_alg); | ||
1286 | __aead_gcm_err: | ||
776 | #ifdef HAS_XTS | 1287 | #ifdef HAS_XTS |
1288 | crypto_unregister_alg(&ablk_xts_alg); | ||
777 | ablk_xts_err: | 1289 | ablk_xts_err: |
778 | #endif | 1290 | #endif |
779 | #ifdef HAS_PCBC | 1291 | #ifdef HAS_PCBC |
@@ -809,6 +1321,8 @@ aes_err: | |||
809 | 1321 | ||
810 | static void __exit aesni_exit(void) | 1322 | static void __exit aesni_exit(void) |
811 | { | 1323 | { |
1324 | crypto_unregister_alg(&__rfc4106_alg); | ||
1325 | crypto_unregister_alg(&rfc4106_alg); | ||
812 | #ifdef HAS_XTS | 1326 | #ifdef HAS_XTS |
813 | crypto_unregister_alg(&ablk_xts_alg); | 1327 | crypto_unregister_alg(&ablk_xts_alg); |
814 | #endif | 1328 | #endif |