author    Tadeusz Struk <tadeusz.struk@intel.com>    2010-11-04 15:00:45 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>   2010-11-13 07:47:55 -0500
commit    0bd82f5f6355775fbaf7d3c664432ce1b862be1e
tree      5f7f7348c2681d572e8bc11f27a42a6e2b8f4023
parent    895be15745d59cc7ede0e1c203e3432b0abdb71c
crypto: aesni-intel - RFC4106 AES-GCM Driver Using Intel New Instructions
This patch adds an optimized RFC4106 AES-GCM implementation for 64-bit kernels. It supports 128-bit AES keys. It uses the crypto AEAD interface type so that a combined AES & GCM operation can be implemented in assembly code. The assembly code uses the Intel(R) AES New Instructions (AES-NI) and the PCLMULQDQ instruction.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Gabriele Paoloni <gabriele.paoloni@intel.com>
Signed-off-by: Aidan O'Mahony <aidan.o.mahony@intel.com>
Signed-off-by: Erdinc Ozturk <erdinc.ozturk@intel.com>
Signed-off-by: James Guilford <james.guilford@intel.com>
Signed-off-by: Wajdi Feghali <wajdi.k.feghali@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
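For context, the new transform is reached through the kernel AEAD API. The sketch below is illustrative only and is not part of the patch (the function name rfc4106_gcm_example and the zeroed key are placeholders): it allocates the "rfc4106(gcm(aes))" algorithm, hands setkey the 16-byte AES key followed by the 4-byte nonce/salt, and selects a 16-byte ICV.

#include <linux/crypto.h>
#include <linux/err.h>

/* Illustrative sketch only -- not part of this patch. */
static int rfc4106_gcm_example(void)
{
	struct crypto_aead *tfm;
	static const u8 key[20];	/* 16-byte AES-128 key + 4-byte salt (all zero here) */
	int err;

	tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_aead_setkey(tfm, key, sizeof(key));
	if (!err)
		err = crypto_aead_setauthsize(tfm, 16);	/* ICV may be 8, 12 or 16 bytes */

	crypto_free_aead(tfm);
	return err;
}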
Diffstat (limited to 'arch')
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S   | 1192
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c  |  518
2 files changed, 1708 insertions(+), 2 deletions(-)
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..aafced54df64 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,17 @@
9 * Vinodh Gopal <vinodh.gopal@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir 10 * Kahraman Akdemir
11 * 11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
12 * This program is free software; you can redistribute it and/or modify 23 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 24 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or 25 * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +29,60 @@
18#include <linux/linkage.h> 29#include <linux/linkage.h>
19#include <asm/inst.h> 30#include <asm/inst.h>
20 31
32.data
33POLY: .octa 0xC2000000000000000000000000000001
34TWOONE: .octa 0x00000001000000000000000000000001
35
36# The order of these constants must not change: the partial-block code
37# below loads byte masks from offsets that straddle SHIFT_MASK/ALL_F/ZERO,
38# so ALL_F must follow SHIFT_MASK and ZERO must follow ALL_F.
39
40SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
41MASK1: .octa 0x0000000000000000ffffffffffffffff
42MASK2: .octa 0xffffffffffffffff0000000000000000
43SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
44ALL_F: .octa 0xffffffffffffffffffffffffffffffff
45ZERO: .octa 0x00000000000000000000000000000000
46ONE: .octa 0x00000000000000000000000000000001
47F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
48dec: .octa 0x1
49enc: .octa 0x2
50
51
21.text 52.text
22 53
54
55#define STACK_OFFSET 8*3
56#define HashKey 16*0 // store HashKey <<1 mod poly here
57#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
58#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
59#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
60#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
61 // bits of HashKey <<1 mod poly here
62 //(for Karatsuba purposes)
63#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
64 // bits of HashKey^2 <<1 mod poly here
65 // (for Karatsuba purposes)
66#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
67 // bits of HashKey^3 <<1 mod poly here
68 // (for Karatsuba purposes)
69#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
70 // bits of HashKey^4 <<1 mod poly here
71 // (for Karatsuba purposes)
72#define VARIABLE_OFFSET 16*8
73
74#define arg1 rdi
75#define arg2 rsi
76#define arg3 rdx
77#define arg4 rcx
78#define arg5 r8
79#define arg6 r9
80#define arg7 STACK_OFFSET+8(%r14)
81#define arg8 STACK_OFFSET+16(%r14)
82#define arg9 STACK_OFFSET+24(%r14)
83#define arg10 STACK_OFFSET+32(%r14)
84
85
23#define STATE1 %xmm0 86#define STATE1 %xmm0
24#define STATE2 %xmm4 87#define STATE2 %xmm4
25#define STATE3 %xmm5 88#define STATE3 %xmm5
@@ -47,6 +110,1135 @@
47#define T2 %r11 110#define T2 %r11
48#define TCTR_LOW T2 111#define TCTR_LOW T2
49 112
113
114/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
115*
116*
117* Input: A and B (128 bits each, bit-reflected)
118* Output: C = A*B*x mod poly (i.e. the product shifted right by one bit)
119* To compute GH = GH*HashKey mod poly, pass HK = HashKey<<1 mod poly as input;
120* then GH = GH*HK*x mod poly, which is equivalent to GH*HashKey mod poly.
121*
122*/
123.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
124 movdqa \GH, \TMP1
125 pshufd $78, \GH, \TMP2
126 pshufd $78, \HK, \TMP3
127 pxor \GH, \TMP2 # TMP2 = a1+a0
128 pxor \HK, \TMP3 # TMP3 = b1+b0
129 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
130 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
131 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
132 pxor \GH, \TMP2
133	pxor	\TMP1, \TMP2		# TMP2 = a1*b0 + a0*b1 (Karatsuba middle term)
134 movdqa \TMP2, \TMP3
135 pslldq $8, \TMP3 # left shift TMP3 2 DWs
136 psrldq $8, \TMP2 # right shift TMP2 2 DWs
137 pxor \TMP3, \GH
138	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK
139
140 # first phase of the reduction
141
142 movdqa \GH, \TMP2
143 movdqa \GH, \TMP3
144 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
145					# in order to perform
146 # independent shifts
147	pslld	$31, \TMP2		# packed left shift <<31
148	pslld	$30, \TMP3		# packed left shift <<30
149	pslld	$25, \TMP4		# packed left shift <<25
150 pxor \TMP3, \TMP2 # xor the shifted versions
151 pxor \TMP4, \TMP2
152 movdqa \TMP2, \TMP5
153 psrldq $4, \TMP5 # right shift TMP5 1 DW
154 pslldq $12, \TMP2 # left shift TMP2 3 DWs
155 pxor \TMP2, \GH
156
157 # second phase of the reduction
158
159 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
160					# in order to perform
161 # independent shifts
162 movdqa \GH,\TMP3
163 movdqa \GH,\TMP4
164	psrld	$1,\TMP2		# packed right shift >>1
165	psrld	$2,\TMP3		# packed right shift >>2
166	psrld	$7,\TMP4		# packed right shift >>7
167 pxor \TMP3,\TMP2 # xor the shifted versions
168 pxor \TMP4,\TMP2
169 pxor \TMP5, \TMP2
170 pxor \TMP2, \GH
171	pxor	\TMP1, \GH		# result is in GH
172.endm
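For reference, a plain-C model of the GF(2^128) multiplication that GHASH is built on (Algorithm 1 of NIST SP 800-38D), written in the byte-wise MSB-first representation. It is illustrative only; the function name gf128_mul is hypothetical, and it is not a drop-in equivalent of the macro above, which works on byte/bit-reflected operands (hence the reflected POLY constant and the opposite shift directions).

#include <string.h>

/* Illustrative model only -- not part of this patch. */
static void gf128_mul(const unsigned char x[16], const unsigned char y[16],
		      unsigned char out[16])
{
	unsigned char z[16] = { 0 };	/* accumulator Z */
	unsigned char v[16];		/* running multiple of Y */
	int i, j, lsb;

	memcpy(v, y, 16);
	for (i = 0; i < 128; i++) {
		/* if bit i of X (MSB first) is set, Z ^= V */
		if (x[i / 8] & (0x80 >> (i % 8)))
			for (j = 0; j < 16; j++)
				z[j] ^= v[j];
		/* V = V * x: shift right one bit, reduce by R = 0xE1 || 0^120 */
		lsb = v[15] & 1;
		for (j = 15; j > 0; j--)
			v[j] = (unsigned char)((v[j] >> 1) | (v[j - 1] << 7));
		v[0] >>= 1;
		if (lsb)
			v[0] ^= 0xE1;
	}
	memcpy(out, z, 16);
}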
173
174/*
175* if a = number of total plaintext bytes
176* b = floor(a/16)
177* num_initial_blocks = b mod 4
178* encrypt the initial num_initial_blocks blocks and apply ghash on
179* the ciphertext
180* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
181* are clobbered
182* %arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
183*/
184
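A small C restatement of the relation above (illustrative only; the helper name is hypothetical), matching the dispatch that aesni_gcm_enc/aesni_gcm_dec perform before invoking this macro:

/* Illustrative only: num_initial_blocks = floor(len/16) mod 4. */
static unsigned int num_initial_blocks(unsigned long plaintext_len)
{
	unsigned long b = plaintext_len / 16;	/* number of whole 16-byte blocks */

	return (unsigned int)(b % 4);	/* the asm tests (len & ~15) & (3 << 4) */
}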
185.macro INITIAL_BLOCKS num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
186XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
187
188 mov arg7, %r10 # %r10 = AAD
189 mov arg8, %r12 # %r12 = aadLen
190 mov %r12, %r11
191 pxor %xmm\i, %xmm\i
192_get_AAD_loop\num_initial_blocks\operation:
193 movd (%r10), \TMP1
194 pslldq $12, \TMP1
195 psrldq $4, %xmm\i
196 pxor \TMP1, %xmm\i
197 add $4, %r10
198 sub $4, %r12
199 jne _get_AAD_loop\num_initial_blocks\operation
200 cmp $16, %r11
201 je _get_AAD_loop2_done\num_initial_blocks\operation
202 mov $16, %r12
203_get_AAD_loop2\num_initial_blocks\operation:
204 psrldq $4, %xmm\i
205 sub $4, %r12
206 cmp %r11, %r12
207 jne _get_AAD_loop2\num_initial_blocks\operation
208_get_AAD_loop2_done\num_initial_blocks\operation:
209 pshufb SHUF_MASK(%rip), %xmm\i # byte-reflect the AAD data
210 xor %r11, %r11 # initialise the data pointer offset as zero
211
212 # start AES for num_initial_blocks blocks
213
214 mov %arg5, %rax # %rax = *Y0
215 movdqu (%rax), \XMM0 # XMM0 = Y0
216 pshufb SHUF_MASK(%rip), \XMM0
217.if \i_seq != 0
218.irpc index, \i_seq
219 paddd ONE(%rip), \XMM0 # INCR Y0
220 movdqa \XMM0, %xmm\index
221 pshufb SHUF_MASK(%rip), %xmm\index # perform a 16 byte swap
222.endr
223.irpc index, \i_seq
224 pxor 16*0(%arg1), %xmm\index
225.endr
226.irpc index, \i_seq
227 movaps 0x10(%rdi), \TMP1
228 AESENC \TMP1, %xmm\index # Round 1
229.endr
230.irpc index, \i_seq
231 movaps 0x20(%arg1), \TMP1
232 AESENC \TMP1, %xmm\index # Round 2
233.endr
234.irpc index, \i_seq
235 movaps 0x30(%arg1), \TMP1
236	AESENC	\TMP1, %xmm\index	# Round 3
237.endr
238.irpc index, \i_seq
239 movaps 0x40(%arg1), \TMP1
240	AESENC	\TMP1, %xmm\index	# Round 4
241.endr
242.irpc index, \i_seq
243 movaps 0x50(%arg1), \TMP1
244	AESENC	\TMP1, %xmm\index	# Round 5
245.endr
246.irpc index, \i_seq
247 movaps 0x60(%arg1), \TMP1
248	AESENC	\TMP1, %xmm\index	# Round 6
249.endr
250.irpc index, \i_seq
251 movaps 0x70(%arg1), \TMP1
252	AESENC	\TMP1, %xmm\index	# Round 7
253.endr
254.irpc index, \i_seq
255 movaps 0x80(%arg1), \TMP1
256	AESENC	\TMP1, %xmm\index	# Round 8
257.endr
258.irpc index, \i_seq
259 movaps 0x90(%arg1), \TMP1
260	AESENC	\TMP1, %xmm\index	# Round 9
261.endr
262.irpc index, \i_seq
263 movaps 0xa0(%arg1), \TMP1
264 AESENCLAST \TMP1, %xmm\index # Round 10
265.endr
266.irpc index, \i_seq
267 movdqu (%arg3 , %r11, 1), \TMP1
268 pxor \TMP1, %xmm\index
269 movdqu %xmm\index, (%arg2 , %r11, 1)
270 # write back plaintext/ciphertext for num_initial_blocks
271 add $16, %r11
272.if \operation == dec
273 movdqa \TMP1, %xmm\index
274.endif
275 pshufb SHUF_MASK(%rip), %xmm\index
276 # prepare plaintext/ciphertext for GHASH computation
277.endr
278.endif
279 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
280 # apply GHASH on num_initial_blocks blocks
281
282.if \i == 5
283 pxor %xmm5, %xmm6
284 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
285 pxor %xmm6, %xmm7
286 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
287 pxor %xmm7, %xmm8
288 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
289.elseif \i == 6
290 pxor %xmm6, %xmm7
291 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
292 pxor %xmm7, %xmm8
293 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
294.elseif \i == 7
295 pxor %xmm7, %xmm8
296 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
297.endif
298 cmp $64, %r13
299 jl _initial_blocks_done\num_initial_blocks\operation
300 # no need for precomputed values
301/*
302*
303* Precomputations for HashKey parallel with encryption of first 4 blocks.
304* HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
305*/
306 paddd ONE(%rip), \XMM0 # INCR Y0
307 movdqa \XMM0, \XMM1
308 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
309 paddd ONE(%rip), \XMM0 # INCR Y0
310 movdqa \XMM0, \XMM2
311 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
312 paddd ONE(%rip), \XMM0 # INCR Y0
313 movdqa \XMM0, \XMM3
314 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
315 paddd ONE(%rip), \XMM0 # INCR Y0
316 movdqa \XMM0, \XMM4
317 pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
318 pxor 16*0(%arg1), \XMM1
319 pxor 16*0(%arg1), \XMM2
320 pxor 16*0(%arg1), \XMM3
321 pxor 16*0(%arg1), \XMM4
322 movdqa \TMP3, \TMP5
323 pshufd $78, \TMP3, \TMP1
324 pxor \TMP3, \TMP1
325 movdqa \TMP1, HashKey_k(%rsp)
326 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
327# TMP5 = HashKey^2<<1 (mod poly)
328 movdqa \TMP5, HashKey_2(%rsp)
329# HashKey_2 = HashKey^2<<1 (mod poly)
330 pshufd $78, \TMP5, \TMP1
331 pxor \TMP5, \TMP1
332 movdqa \TMP1, HashKey_2_k(%rsp)
333.irpc index, 1234 # do 4 rounds
334 movaps 0x10*\index(%arg1), \TMP1
335 AESENC \TMP1, \XMM1
336 AESENC \TMP1, \XMM2
337 AESENC \TMP1, \XMM3
338 AESENC \TMP1, \XMM4
339.endr
340 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
341# TMP5 = HashKey^3<<1 (mod poly)
342 movdqa \TMP5, HashKey_3(%rsp)
343 pshufd $78, \TMP5, \TMP1
344 pxor \TMP5, \TMP1
345 movdqa \TMP1, HashKey_3_k(%rsp)
346.irpc index, 56789 # do next 5 rounds
347 movaps 0x10*\index(%arg1), \TMP1
348 AESENC \TMP1, \XMM1
349 AESENC \TMP1, \XMM2
350 AESENC \TMP1, \XMM3
351 AESENC \TMP1, \XMM4
352.endr
353 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
354# TMP5 = HashKey^4<<1 (mod poly)
355 movdqa \TMP5, HashKey_4(%rsp)
356 pshufd $78, \TMP5, \TMP1
357 pxor \TMP5, \TMP1
358 movdqa \TMP1, HashKey_4_k(%rsp)
359 movaps 0xa0(%arg1), \TMP2
360 AESENCLAST \TMP2, \XMM1
361 AESENCLAST \TMP2, \XMM2
362 AESENCLAST \TMP2, \XMM3
363 AESENCLAST \TMP2, \XMM4
364 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
365 pxor \TMP1, \XMM1
366.if \operation == dec
367 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
368 movdqa \TMP1, \XMM1
369.endif
370 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
371 pxor \TMP1, \XMM2
372.if \operation == dec
373 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
374 movdqa \TMP1, \XMM2
375.endif
376 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
377 pxor \TMP1, \XMM3
378.if \operation == dec
379 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
380 movdqa \TMP1, \XMM3
381.endif
382 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
383 pxor \TMP1, \XMM4
384.if \operation == dec
385 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
386 movdqa \TMP1, \XMM4
387.else
388 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
389 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
390 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
391 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
392.endif
393 add $64, %r11
394 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
395 pxor \XMMDst, \XMM1
396# combine GHASHed value with the corresponding ciphertext
397 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
398 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
399 pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
400_initial_blocks_done\num_initial_blocks\operation:
401.endm
402
403/*
404* encrypt 4 blocks at a time
405* ghash the 4 previously encrypted ciphertext blocks
406* arg1, %arg2, %arg3 are used as pointers only, not modified
407* %r11 is the data offset value
408*/
409.macro GHASH_4_ENCRYPT_4_PARALLEL TMP1 TMP2 TMP3 TMP4 TMP5 \
410TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
411
412 movdqa \XMM1, \XMM5
413 movdqa \XMM2, \XMM6
414 movdqa \XMM3, \XMM7
415 movdqa \XMM4, \XMM8
416
417	# multiply XMM5 * HashKey_4 (held in TMP5) using Karatsuba
418
419 movdqa \XMM5, \TMP4
420 pshufd $78, \XMM5, \TMP6
421 pxor \XMM5, \TMP6
422 paddd ONE(%rip), \XMM0 # INCR CNT
423 movdqa HashKey_4(%rsp), \TMP5
424 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
425 movdqa \XMM0, \XMM1
426 paddd ONE(%rip), \XMM0 # INCR CNT
427 movdqa \XMM0, \XMM2
428 paddd ONE(%rip), \XMM0 # INCR CNT
429 movdqa \XMM0, \XMM3
430 paddd ONE(%rip), \XMM0 # INCR CNT
431 movdqa \XMM0, \XMM4
432 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
433 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
434 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
435 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
436 pshufb SHUF_MASK(%rip), \XMM4 # perform a 16 byte swap
437 pxor (%arg1), \XMM1
438 pxor (%arg1), \XMM2
439 pxor (%arg1), \XMM3
440 pxor (%arg1), \XMM4
441 movdqa HashKey_4_k(%rsp), \TMP5
442 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
443 movaps 0x10(%arg1), \TMP1
444 AESENC \TMP1, \XMM1 # Round 1
445 AESENC \TMP1, \XMM2
446 AESENC \TMP1, \XMM3
447 AESENC \TMP1, \XMM4
448 movaps 0x20(%arg1), \TMP1
449 AESENC \TMP1, \XMM1 # Round 2
450 AESENC \TMP1, \XMM2
451 AESENC \TMP1, \XMM3
452 AESENC \TMP1, \XMM4
453 movdqa \XMM6, \TMP1
454 pshufd $78, \XMM6, \TMP2
455 pxor \XMM6, \TMP2
456 movdqa HashKey_3(%rsp), \TMP5
457 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
458 movaps 0x30(%arg1), \TMP3
459 AESENC \TMP3, \XMM1 # Round 3
460 AESENC \TMP3, \XMM2
461 AESENC \TMP3, \XMM3
462 AESENC \TMP3, \XMM4
463 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
464 movaps 0x40(%arg1), \TMP3
465 AESENC \TMP3, \XMM1 # Round 4
466 AESENC \TMP3, \XMM2
467 AESENC \TMP3, \XMM3
468 AESENC \TMP3, \XMM4
469 movdqa HashKey_3_k(%rsp), \TMP5
470 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
471 movaps 0x50(%arg1), \TMP3
472 AESENC \TMP3, \XMM1 # Round 5
473 AESENC \TMP3, \XMM2
474 AESENC \TMP3, \XMM3
475 AESENC \TMP3, \XMM4
476 pxor \TMP1, \TMP4
477# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
478 pxor \XMM6, \XMM5
479 pxor \TMP2, \TMP6
480 movdqa \XMM7, \TMP1
481 pshufd $78, \XMM7, \TMP2
482 pxor \XMM7, \TMP2
483 movdqa HashKey_2(%rsp ), \TMP5
484
485	# Multiply XMM7 * HashKey_2 (held in TMP5) using Karatsuba
486
487 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
488 movaps 0x60(%arg1), \TMP3
489 AESENC \TMP3, \XMM1 # Round 6
490 AESENC \TMP3, \XMM2
491 AESENC \TMP3, \XMM3
492 AESENC \TMP3, \XMM4
493 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
494 movaps 0x70(%arg1), \TMP3
495 AESENC \TMP3, \XMM1 # Round 7
496 AESENC \TMP3, \XMM2
497 AESENC \TMP3, \XMM3
498 AESENC \TMP3, \XMM4
499 movdqa HashKey_2_k(%rsp), \TMP5
500 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
501 movaps 0x80(%arg1), \TMP3
502 AESENC \TMP3, \XMM1 # Round 8
503 AESENC \TMP3, \XMM2
504 AESENC \TMP3, \XMM3
505 AESENC \TMP3, \XMM4
506 pxor \TMP1, \TMP4
507# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
508 pxor \XMM7, \XMM5
509 pxor \TMP2, \TMP6
510
511 # Multiply XMM8 * HashKey
512 # XMM8 and TMP5 hold the values for the two operands
513
514 movdqa \XMM8, \TMP1
515 pshufd $78, \XMM8, \TMP2
516 pxor \XMM8, \TMP2
517 movdqa HashKey(%rsp), \TMP5
518 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
519 movaps 0x90(%arg1), \TMP3
520 AESENC \TMP3, \XMM1 # Round 9
521 AESENC \TMP3, \XMM2
522 AESENC \TMP3, \XMM3
523 AESENC \TMP3, \XMM4
524 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
525 movaps 0xa0(%arg1), \TMP3
526 AESENCLAST \TMP3, \XMM1 # Round 10
527 AESENCLAST \TMP3, \XMM2
528 AESENCLAST \TMP3, \XMM3
529 AESENCLAST \TMP3, \XMM4
530 movdqa HashKey_k(%rsp), \TMP5
531 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
532 movdqu (%arg3,%r11,1), \TMP3
533 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
534.if \operation == dec
535 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
536 movdqa \TMP3, \XMM1
537.endif
538 movdqu 16(%arg3,%r11,1), \TMP3
539 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
540.if \operation == dec
541 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
542 movdqa \TMP3, \XMM2
543.endif
544 movdqu 32(%arg3,%r11,1), \TMP3
545 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
546.if \operation == dec
547 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
548 movdqa \TMP3, \XMM3
549.endif
550 movdqu 48(%arg3,%r11,1), \TMP3
551 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
552.if \operation == dec
553 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
554 movdqa \TMP3, \XMM4
555.else
556 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
557 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
558 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
559 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
560.endif
561 pshufb SHUF_MASK(%rip), \XMM1 # perform a 16 byte swap
562 pshufb SHUF_MASK(%rip), \XMM2 # perform a 16 byte swap
563 pshufb SHUF_MASK(%rip), \XMM3 # perform a 16 byte swap
564	pshufb SHUF_MASK(%rip), \XMM4	# perform a 16 byte swap
565
566 pxor \TMP4, \TMP1
567 pxor \XMM8, \XMM5
568 pxor \TMP6, \TMP2
569 pxor \TMP1, \TMP2
570 pxor \XMM5, \TMP2
571 movdqa \TMP2, \TMP3
572 pslldq $8, \TMP3 # left shift TMP3 2 DWs
573 psrldq $8, \TMP2 # right shift TMP2 2 DWs
574 pxor \TMP3, \XMM5
575 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
576
577 # first phase of reduction
578
579 movdqa \XMM5, \TMP2
580 movdqa \XMM5, \TMP3
581 movdqa \XMM5, \TMP4
582# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
583	pslld $31, \TMP2		# packed left shift << 31
584	pslld $30, \TMP3		# packed left shift << 30
585	pslld $25, \TMP4		# packed left shift << 25
586 pxor \TMP3, \TMP2 # xor the shifted versions
587 pxor \TMP4, \TMP2
588 movdqa \TMP2, \TMP5
589 psrldq $4, \TMP5 # right shift T5 1 DW
590 pslldq $12, \TMP2 # left shift T2 3 DWs
591 pxor \TMP2, \XMM5
592
593 # second phase of reduction
594
595 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
596 movdqa \XMM5,\TMP3
597 movdqa \XMM5,\TMP4
598	psrld $1, \TMP2			# packed right shift >>1
599	psrld $2, \TMP3			# packed right shift >>2
600	psrld $7, \TMP4			# packed right shift >>7
601 pxor \TMP3,\TMP2 # xor the shifted versions
602 pxor \TMP4,\TMP2
603 pxor \TMP5, \TMP2
604 pxor \TMP2, \XMM5
605	pxor	\TMP1, \XMM5		# result is in XMM5
606
607 pxor \XMM5, \XMM1
608.endm
609
610/* GHASH the last 4 ciphertext blocks. */
611.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
612TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
613
614	# Multiply XMM1 * HashKey_4 (using Karatsuba)
615
616 movdqa \XMM1, \TMP6
617 pshufd $78, \XMM1, \TMP2
618 pxor \XMM1, \TMP2
619 movdqa HashKey_4(%rsp), \TMP5
620 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
621 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
622 movdqa HashKey_4_k(%rsp), \TMP4
623 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
624 movdqa \XMM1, \XMMDst
625 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
626
627	# Multiply XMM2 * HashKey_3 (using Karatsuba)
628
629 movdqa \XMM2, \TMP1
630 pshufd $78, \XMM2, \TMP2
631 pxor \XMM2, \TMP2
632 movdqa HashKey_3(%rsp), \TMP5
633 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
634 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
635 movdqa HashKey_3_k(%rsp), \TMP4
636 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
637 pxor \TMP1, \TMP6
638 pxor \XMM2, \XMMDst
639 pxor \TMP2, \XMM1
640# results accumulated in TMP6, XMMDst, XMM1
641
642	# Multiply XMM3 * HashKey_2 (using Karatsuba)
643
644 movdqa \XMM3, \TMP1
645 pshufd $78, \XMM3, \TMP2
646 pxor \XMM3, \TMP2
647 movdqa HashKey_2(%rsp), \TMP5
648 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
649 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
650 movdqa HashKey_2_k(%rsp), \TMP4
651 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
652 pxor \TMP1, \TMP6
653 pxor \XMM3, \XMMDst
654 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
655
656	# Multiply XMM4 * HashKey (using Karatsuba)
657 movdqa \XMM4, \TMP1
658 pshufd $78, \XMM4, \TMP2
659 pxor \XMM4, \TMP2
660 movdqa HashKey(%rsp), \TMP5
661 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
662 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
663 movdqa HashKey_k(%rsp), \TMP4
664 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
665 pxor \TMP1, \TMP6
666 pxor \XMM4, \XMMDst
667 pxor \XMM1, \TMP2
668 pxor \TMP6, \TMP2
669 pxor \XMMDst, \TMP2
670 # middle section of the temp results combined as in karatsuba algorithm
671 movdqa \TMP2, \TMP4
672 pslldq $8, \TMP4 # left shift TMP4 2 DWs
673 psrldq $8, \TMP2 # right shift TMP2 2 DWs
674 pxor \TMP4, \XMMDst
675 pxor \TMP2, \TMP6
676# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
677 # first phase of the reduction
678 movdqa \XMMDst, \TMP2
679 movdqa \XMMDst, \TMP3
680 movdqa \XMMDst, \TMP4
681# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
682	pslld $31, \TMP2		# packed left shifting << 31
683	pslld $30, \TMP3		# packed left shifting << 30
684	pslld $25, \TMP4		# packed left shifting << 25
685 pxor \TMP3, \TMP2 # xor the shifted versions
686 pxor \TMP4, \TMP2
687 movdqa \TMP2, \TMP7
688 psrldq $4, \TMP7 # right shift TMP7 1 DW
689 pslldq $12, \TMP2 # left shift TMP2 3 DWs
690 pxor \TMP2, \XMMDst
691
692 # second phase of the reduction
693 movdqa \XMMDst, \TMP2
694 # make 3 copies of XMMDst for doing 3 shift operations
695 movdqa \XMMDst, \TMP3
696 movdqa \XMMDst, \TMP4
697	psrld $1, \TMP2			# packed right shift >> 1
698	psrld $2, \TMP3			# packed right shift >> 2
699	psrld $7, \TMP4			# packed right shift >> 7
700 pxor \TMP3, \TMP2 # xor the shifted versions
701 pxor \TMP4, \TMP2
702 pxor \TMP7, \TMP2
703 pxor \TMP2, \XMMDst
704 pxor \TMP6, \XMMDst # reduced result is in XMMDst
705.endm
706
707/* Encrypt a single block */
708.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
709
710 pxor (%arg1), \XMM0
711 movaps 16(%arg1), \TMP1
712 AESENC \TMP1, \XMM0
713 movaps 32(%arg1), \TMP1
714 AESENC \TMP1, \XMM0
715 movaps 48(%arg1), \TMP1
716 AESENC \TMP1, \XMM0
717 movaps 64(%arg1), \TMP1
718 AESENC \TMP1, \XMM0
719 movaps 80(%arg1), \TMP1
720 AESENC \TMP1, \XMM0
721 movaps 96(%arg1), \TMP1
722 AESENC \TMP1, \XMM0
723 movaps 112(%arg1), \TMP1
724 AESENC \TMP1, \XMM0
725 movaps 128(%arg1), \TMP1
726 AESENC \TMP1, \XMM0
727 movaps 144(%arg1), \TMP1
728 AESENC \TMP1, \XMM0
729 movaps 160(%arg1), \TMP1
730 AESENCLAST \TMP1, \XMM0
731.endm
732
733
734/*****************************************************************************
735* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
736* u8 *out, // Plaintext output. Encrypt in-place is allowed.
737* const u8 *in, // Ciphertext input
738* u64 plaintext_len, // Length of data in bytes for decryption.
739* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
740* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
741* // concatenated with 0x00000001. 16-byte aligned pointer.
742* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
743* const u8 *aad, // Additional Authentication Data (AAD)
744* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
745* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
746* // given authentication tag and only return the plaintext if they match.
747* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
748* // (most likely), 12 or 8.
749*
750* Assumptions:
751*
752* keys:
753*       keys are pre-expanded and aligned to 16 bytes. We are using the first
754*       set of 11 round keys in the data structure void *aes_ctx
755*
756* iv:
757* 0 1 2 3
758* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
759* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
760* | Salt (From the SA) |
761* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
762* | Initialization Vector |
763* | (This is the sequence number from IPSec header) |
764* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
765* | 0x1 |
766* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
767*
768*
769*
770* AAD:
771* AAD padded to 128 bits with 0
772* for example, assume AAD is a u32 vector
773*
774* if AAD is 8 bytes:
775* AAD[3] = {A0, A1};
776* padded AAD in xmm register = {A1 A0 0 0}
777*
778* 0 1 2 3
779* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
780* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
781* | SPI (A1) |
782* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
783* | 32-bit Sequence Number (A0) |
784* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
785* | 0x0 |
786* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
787*
788* AAD Format with 32-bit Sequence Number
789*
790* if AAD is 12 bytes:
791* AAD[3] = {A0, A1, A2};
792* padded AAD in xmm register = {A2 A1 A0 0}
793*
794* 0 1 2 3
795* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
796* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
797* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
798* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
799* | SPI (A2) |
800* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
801* | 64-bit Extended Sequence Number {A1,A0} |
802* | |
803* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
804* | 0x0 |
805* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
806*
807* AAD Format with 64-bit Extended Sequence Number
808*
809* aadLen:
810* from the definition of the spec, aadLen can only be 8 or 12 bytes.
811* The code supports 16 too but for other sizes, the code will fail.
812*
813* TLen:
814* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
815* For other sizes, the code will fail.
816*
817* poly = x^128 + x^127 + x^126 + x^121 + 1
818*
819*****************************************************************************/
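A minimal C sketch (hypothetical helper, not part of this patch) of how a caller assembles the 16-byte pre-counter block j0 described above, i.e. the 4-byte salt from the SA, the 8-byte IV from the ESP payload, then the constant 0x00000001, mirroring what aesni-intel_glue.c does before calling aesni_gcm_dec()/aesni_gcm_enc():

#include <string.h>

/* Illustrative sketch only -- not part of this patch. */
static void rfc4106_build_j0(unsigned char j0[16], const unsigned char salt[4],
			     const unsigned char iv[8])
{
	memcpy(j0, salt, 4);		/* 4-byte salt from the Security Association */
	memcpy(j0 + 4, iv, 8);		/* 8-byte explicit IV from the ESP payload   */
	j0[12] = 0;			/* 32-bit counter, big endian ...            */
	j0[13] = 0;
	j0[14] = 0;
	j0[15] = 1;			/* ... initialised to 0x00000001             */
}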
820
821ENTRY(aesni_gcm_dec)
822 push %r12
823 push %r13
824 push %r14
825 mov %rsp, %r14
826/*
827* states of %xmm registers %xmm6:%xmm15 not saved
828* all %xmm registers are clobbered
829*/
830 sub $VARIABLE_OFFSET, %rsp
831 and $~63, %rsp # align rsp to 64 bytes
832 mov %arg6, %r12
833 movdqu (%r12), %xmm13 # %xmm13 = HashKey
834 pshufb SHUF_MASK(%rip), %xmm13
835
836# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
837
838 movdqa %xmm13, %xmm2
839 psllq $1, %xmm13
840 psrlq $63, %xmm2
841 movdqa %xmm2, %xmm1
842 pslldq $8, %xmm2
843 psrldq $8, %xmm1
844 por %xmm2, %xmm13
845
846 # Reduction
847
848 pshufd $0x24, %xmm1, %xmm2
849 pcmpeqd TWOONE(%rip), %xmm2
850 pand POLY(%rip), %xmm2
851 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
852
853
854 # Decrypt first few blocks
855
856 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
857 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
858 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
859 mov %r13, %r12
860 and $(3<<4), %r12
861 jz _initial_num_blocks_is_0_decrypt
862 cmp $(2<<4), %r12
863 jb _initial_num_blocks_is_1_decrypt
864 je _initial_num_blocks_is_2_decrypt
865_initial_num_blocks_is_3_decrypt:
866 INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
867%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
868 sub $48, %r13
869 jmp _initial_blocks_decrypted
870_initial_num_blocks_is_2_decrypt:
871 INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
872%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
873 sub $32, %r13
874 jmp _initial_blocks_decrypted
875_initial_num_blocks_is_1_decrypt:
876 INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
877%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
878 sub $16, %r13
879 jmp _initial_blocks_decrypted
880_initial_num_blocks_is_0_decrypt:
881 INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
882%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
883_initial_blocks_decrypted:
884 cmp $0, %r13
885 je _zero_cipher_left_decrypt
886 sub $64, %r13
887 je _four_cipher_left_decrypt
888_decrypt_by_4:
889 GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
890%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
891 add $64, %r11
892 sub $64, %r13
893 jne _decrypt_by_4
894_four_cipher_left_decrypt:
895 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
896%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
897_zero_cipher_left_decrypt:
898 mov %arg4, %r13
899 and $15, %r13 # %r13 = arg4 (mod 16)
900 je _multiple_of_16_bytes_decrypt
901
902	# Handle the last <16 byte block separately
903
904 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
905 pshufb SHUF_MASK(%rip), %xmm0
906 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
907 sub $16, %r11
908 add %r13, %r11
909	movdqu (%arg3,%r11,1), %xmm1	# receive the last <16 byte block
910 lea SHIFT_MASK+16(%rip), %r12
911 sub %r13, %r12
912# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
913# (%r13 is the number of bytes in plaintext mod 16)
914 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
915	pshufb %xmm2, %xmm1		# right shift 16-%r13 bytes
916 movdqa %xmm1, %xmm2
917 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
918 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
919 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
920 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
921 pand %xmm1, %xmm2
922 pshufb SHUF_MASK(%rip),%xmm2
923 pxor %xmm2, %xmm8
924 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
925 # GHASH computation for the last <16 byte block
926 sub %r13, %r11
927 add $16, %r11
928
929 # output %r13 bytes
930 movq %xmm0, %rax
931 cmp $8, %r13
932 jle _less_than_8_bytes_left_decrypt
933 mov %rax, (%arg2 , %r11, 1)
934 add $8, %r11
935 psrldq $8, %xmm0
936 movq %xmm0, %rax
937 sub $8, %r13
938_less_than_8_bytes_left_decrypt:
939 mov %al, (%arg2, %r11, 1)
940 add $1, %r11
941 shr $8, %rax
942 sub $1, %r13
943 jne _less_than_8_bytes_left_decrypt
944_multiple_of_16_bytes_decrypt:
945	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
946	shl	$3, %r12		# convert into number of bits
947	movd	%r12d, %xmm15		# len(A) in %xmm15
948	shl	$3, %arg4		# len(C) in bits (*8)
949 movq %arg4, %xmm1
950 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
951 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
952 pxor %xmm15, %xmm8
953 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
954 # final GHASH computation
955 pshufb SHUF_MASK(%rip), %xmm8
956 mov %arg5, %rax # %rax = *Y0
957 movdqu (%rax), %xmm0 # %xmm0 = Y0
958 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
959 pxor %xmm8, %xmm0
960_return_T_decrypt:
961 mov arg9, %r10 # %r10 = authTag
962 mov arg10, %r11 # %r11 = auth_tag_len
963 cmp $16, %r11
964 je _T_16_decrypt
965 cmp $12, %r11
966 je _T_12_decrypt
967_T_8_decrypt:
968 movq %xmm0, %rax
969 mov %rax, (%r10)
970 jmp _return_T_done_decrypt
971_T_12_decrypt:
972 movq %xmm0, %rax
973 mov %rax, (%r10)
974 psrldq $8, %xmm0
975 movd %xmm0, %eax
976 mov %eax, 8(%r10)
977 jmp _return_T_done_decrypt
978_T_16_decrypt:
979 movdqu %xmm0, (%r10)
980_return_T_done_decrypt:
981 mov %r14, %rsp
982 pop %r14
983 pop %r13
984 pop %r12
985 ret
986
987
988/*****************************************************************************
989* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
990* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
991* const u8 *in, // Plaintext input
992* u64 plaintext_len, // Length of data in bytes for encryption.
993* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
994* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
995* // concatenated with 0x00000001. 16-byte aligned pointer.
996* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
997* const u8 *aad, // Additional Authentication Data (AAD)
998* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
999* u8 *auth_tag, // Authenticated Tag output.
1000* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1001* // 12 or 8.
1002*
1003* Assumptions:
1004*
1005* keys:
1006*       keys are pre-expanded and aligned to 16 bytes. We are using the
1007*       first set of 11 round keys in the data structure void *aes_ctx
1008*
1009*
1010* iv:
1011* 0 1 2 3
1012* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1013* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1014* | Salt (From the SA) |
1015* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1016* | Initialization Vector |
1017* | (This is the sequence number from IPSec header) |
1018* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1019* | 0x1 |
1020* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1021*
1022*
1023*
1024* AAD:
1025* AAD padded to 128 bits with 0
1026* for example, assume AAD is a u32 vector
1027*
1028* if AAD is 8 bytes:
1029* AAD[3] = {A0, A1};
1030* padded AAD in xmm register = {A1 A0 0 0}
1031*
1032* 0 1 2 3
1033* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1034* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1035* | SPI (A1) |
1036* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1037* | 32-bit Sequence Number (A0) |
1038* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1039* | 0x0 |
1040* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1041*
1042* AAD Format with 32-bit Sequence Number
1043*
1044* if AAD is 12 bytes:
1045* AAD[3] = {A0, A1, A2};
1046* padded AAD in xmm register = {A2 A1 A0 0}
1047*
1048* 0 1 2 3
1049* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1050* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1051* | SPI (A2) |
1052* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1053* | 64-bit Extended Sequence Number {A1,A0} |
1054* | |
1055* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1056* | 0x0 |
1057* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1058*
1059* AAD Format with 64-bit Extended Sequence Number
1060*
1061* aadLen:
1062* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1063* The code supports 16 too but for other sizes, the code will fail.
1064*
1065* TLen:
1066* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1067* For other sizes, the code will fail.
1068*
1069* poly = x^128 + x^127 + x^126 + x^121 + 1
1070***************************************************************************/
1071ENTRY(aesni_gcm_enc)
1072 push %r12
1073 push %r13
1074 push %r14
1075 mov %rsp, %r14
1076#
1077# states of %xmm registers %xmm6:%xmm15 not saved
1078# all %xmm registers are clobbered
1079#
1080 sub $VARIABLE_OFFSET, %rsp
1081 and $~63, %rsp
1082 mov %arg6, %r12
1083 movdqu (%r12), %xmm13
1084 pshufb SHUF_MASK(%rip), %xmm13
1085
1086# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1087
1088 movdqa %xmm13, %xmm2
1089 psllq $1, %xmm13
1090 psrlq $63, %xmm2
1091 movdqa %xmm2, %xmm1
1092 pslldq $8, %xmm2
1093 psrldq $8, %xmm1
1094 por %xmm2, %xmm13
1095
1096 # reduce HashKey<<1
1097
1098 pshufd $0x24, %xmm1, %xmm2
1099 pcmpeqd TWOONE(%rip), %xmm2
1100 pand POLY(%rip), %xmm2
1101 pxor %xmm2, %xmm13
1102	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
1103	mov	%arg4, %r13		# %r13 = plaintext_len in bytes
1104 and $-16, %r13
1105 mov %r13, %r12
1106
1107 # Encrypt first few blocks
1108
1109 and $(3<<4), %r12
1110 jz _initial_num_blocks_is_0_encrypt
1111 cmp $(2<<4), %r12
1112 jb _initial_num_blocks_is_1_encrypt
1113 je _initial_num_blocks_is_2_encrypt
1114_initial_num_blocks_is_3_encrypt:
1115 INITIAL_BLOCKS 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1116%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1117 sub $48, %r13
1118 jmp _initial_blocks_encrypted
1119_initial_num_blocks_is_2_encrypt:
1120 INITIAL_BLOCKS 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1121%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1122 sub $32, %r13
1123 jmp _initial_blocks_encrypted
1124_initial_num_blocks_is_1_encrypt:
1125 INITIAL_BLOCKS 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1126%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1127 sub $16, %r13
1128 jmp _initial_blocks_encrypted
1129_initial_num_blocks_is_0_encrypt:
1130 INITIAL_BLOCKS 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1131%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1132_initial_blocks_encrypted:
1133
1134 # Main loop - Encrypt remaining blocks
1135
1136 cmp $0, %r13
1137 je _zero_cipher_left_encrypt
1138 sub $64, %r13
1139 je _four_cipher_left_encrypt
1140_encrypt_by_4_encrypt:
1141 GHASH_4_ENCRYPT_4_PARALLEL %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1142%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1143 add $64, %r11
1144 sub $64, %r13
1145 jne _encrypt_by_4_encrypt
1146_four_cipher_left_encrypt:
1147 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1148%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1149_zero_cipher_left_encrypt:
1150 mov %arg4, %r13
1151 and $15, %r13 # %r13 = arg4 (mod 16)
1152 je _multiple_of_16_bytes_encrypt
1153
1154	# Handle the last <16 byte block separately
1155 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1156 pshufb SHUF_MASK(%rip), %xmm0
1157 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1158 sub $16, %r11
1159 add %r13, %r11
1160	movdqu (%arg3,%r11,1), %xmm1	# receive the last <16 byte block
1161 lea SHIFT_MASK+16(%rip), %r12
1162 sub %r13, %r12
1163 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1164 # (%r13 is the number of bytes in plaintext mod 16)
1165 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1166	pshufb %xmm2, %xmm1		# shift right 16-r13 bytes
1167 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1168 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1169 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1170 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1171
1172 pshufb SHUF_MASK(%rip),%xmm0
1173 pxor %xmm0, %xmm8
1174 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1175 # GHASH computation for the last <16 byte block
1176 sub %r13, %r11
1177 add $16, %r11
1178 pshufb SHUF_MASK(%rip), %xmm0
1179 # shuffle xmm0 back to output as ciphertext
1180
1181 # Output %r13 bytes
1182 movq %xmm0, %rax
1183 cmp $8, %r13
1184 jle _less_than_8_bytes_left_encrypt
1185 mov %rax, (%arg2 , %r11, 1)
1186 add $8, %r11
1187 psrldq $8, %xmm0
1188 movq %xmm0, %rax
1189 sub $8, %r13
1190_less_than_8_bytes_left_encrypt:
1191 mov %al, (%arg2, %r11, 1)
1192 add $1, %r11
1193 shr $8, %rax
1194 sub $1, %r13
1195 jne _less_than_8_bytes_left_encrypt
1196_multiple_of_16_bytes_encrypt:
1197	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
1198	shl	$3, %r12		# convert into number of bits
1199	movd	%r12d, %xmm15		# len(A) in %xmm15
1200	shl	$3, %arg4		# len(C) in bits (*8)
1201 movq %arg4, %xmm1
1202 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1203 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1204 pxor %xmm15, %xmm8
1205 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1206 # final GHASH computation
1207
1208 pshufb SHUF_MASK(%rip), %xmm8 # perform a 16 byte swap
1209 mov %arg5, %rax # %rax = *Y0
1210 movdqu (%rax), %xmm0 # %xmm0 = Y0
1211 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1212 pxor %xmm8, %xmm0
1213_return_T_encrypt:
1214 mov arg9, %r10 # %r10 = authTag
1215 mov arg10, %r11 # %r11 = auth_tag_len
1216 cmp $16, %r11
1217 je _T_16_encrypt
1218 cmp $12, %r11
1219 je _T_12_encrypt
1220_T_8_encrypt:
1221 movq %xmm0, %rax
1222 mov %rax, (%r10)
1223 jmp _return_T_done_encrypt
1224_T_12_encrypt:
1225 movq %xmm0, %rax
1226 mov %rax, (%r10)
1227 psrldq $8, %xmm0
1228 movd %xmm0, %eax
1229 mov %eax, 8(%r10)
1230 jmp _return_T_done_encrypt
1231_T_16_encrypt:
1232 movdqu %xmm0, (%r10)
1233_return_T_done_encrypt:
1234 mov %r14, %rsp
1235 pop %r14
1236 pop %r13
1237 pop %r12
1238 ret
1239
1240
1241
50_key_expansion_128: 1242_key_expansion_128:
51_key_expansion_256a: 1243_key_expansion_256a:
52 pshufd $0b11111111, %xmm1, %xmm1 1244 pshufd $0b11111111, %xmm1, %xmm1
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..02d349d64423 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
5 * Copyright (C) 2008, Intel Corp. 5 * Copyright (C) 2008, Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com> 6 * Author: Huang Ying <ying.huang@intel.com>
7 * 7 *
8 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
9 * interface for 64-bit kernels.
10 * Authors: Adrian Hoban <adrian.hoban@intel.com>
11 * Gabriele Paoloni <gabriele.paoloni@intel.com>
12 * Tadeusz Struk (tadeusz.struk@intel.com)
13 * Aidan O'Mahony (aidan.o.mahony@intel.com)
14 * Copyright (c) 2010, Intel Corporation.
15 *
8 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
21#include <crypto/ctr.h> 29#include <crypto/ctr.h>
22#include <asm/i387.h> 30#include <asm/i387.h>
23#include <asm/aes.h> 31#include <asm/aes.h>
32#include <crypto/scatterwalk.h>
33#include <crypto/internal/aead.h>
34#include <linux/workqueue.h>
35#include <linux/spinlock.h>
24 36
25#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) 37#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
26#define HAS_CTR 38#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
42 struct cryptd_ablkcipher *cryptd_tfm; 54 struct cryptd_ablkcipher *cryptd_tfm;
43}; 55};
44 56
45#define AESNI_ALIGN 16 57/* This data is stored at the end of the crypto_tfm struct.
58 * It is a per-session data storage location.
59 * This needs to be 16 byte aligned.
60 */
61struct aesni_rfc4106_gcm_ctx {
62 u8 hash_subkey[16];
63 struct crypto_aes_ctx aes_key_expanded;
64 u8 nonce[4];
65 struct cryptd_aead *cryptd_tfm;
66};
67
68struct aesni_gcm_set_hash_subkey_result {
69 int err;
70 struct completion completion;
71};
72
73struct aesni_hash_subkey_req_data {
74 u8 iv[16];
75 struct aesni_gcm_set_hash_subkey_result result;
76 struct scatterlist sg;
77};
78
79#define AESNI_ALIGN (16)
46#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) 80#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
81#define RFC4106_HASH_SUBKEY_SIZE 16
47 82
48asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 83asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
49 unsigned int key_len); 84 unsigned int key_len);
@@ -62,6 +97,57 @@ asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 97asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv); 98 const u8 *in, unsigned int len, u8 *iv);
64 99
100/* asmlinkage void aesni_gcm_enc()
101 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
102 * u8 *out, Ciphertext output. Encrypt in-place is allowed.
103 * const u8 *in, Plaintext input
104 * unsigned long plaintext_len, Length of data in bytes for encryption.
105 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
106 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
107 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
108 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
109 * const u8 *aad, Additional Authentication Data (AAD)
110 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
111 * is going to be 8 or 12 bytes
112 * u8 *auth_tag, Authenticated Tag output.
113 * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
114 * Valid values are 16 (most likely), 12 or 8.
115 */
116asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
117 const u8 *in, unsigned long plaintext_len, u8 *iv,
118 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
119 u8 *auth_tag, unsigned long auth_tag_len);
120
121/* asmlinkage void aesni_gcm_dec()
122 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
123 * u8 *out, Plaintext output. Decrypt in-place is allowed.
124 * const u8 *in, Ciphertext input
125 * unsigned long ciphertext_len, Length of data in bytes for decryption.
126 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
127 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
128 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
129 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
130 * const u8 *aad, Additional Authentication Data (AAD)
131 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
132 * to be 8 or 12 bytes
133 * u8 *auth_tag, Authenticated Tag output.
134 * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
135 * Valid values are 16 (most likely), 12 or 8.
136 */
137asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
138 const u8 *in, unsigned long ciphertext_len, u8 *iv,
139 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
140 u8 *auth_tag, unsigned long auth_tag_len);
141
142static inline struct
143aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
144{
145 return
146 (struct aesni_rfc4106_gcm_ctx *)
147 PTR_ALIGN((u8 *)
148 crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
149}
150
65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 151static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
66{ 152{
67 unsigned long addr = (unsigned long)raw_ctx; 153 unsigned long addr = (unsigned long)raw_ctx;
@@ -730,6 +816,422 @@ static struct crypto_alg ablk_xts_alg = {
730}; 816};
731#endif 817#endif
732 818
819static int rfc4106_init(struct crypto_tfm *tfm)
820{
821 struct cryptd_aead *cryptd_tfm;
822 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
823 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
824 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
825 if (IS_ERR(cryptd_tfm))
826 return PTR_ERR(cryptd_tfm);
827 ctx->cryptd_tfm = cryptd_tfm;
828 tfm->crt_aead.reqsize = sizeof(struct aead_request)
829 + crypto_aead_reqsize(&cryptd_tfm->base);
830 return 0;
831}
832
833static void rfc4106_exit(struct crypto_tfm *tfm)
834{
835 struct aesni_rfc4106_gcm_ctx *ctx =
836 (struct aesni_rfc4106_gcm_ctx *)
837 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
838 if (!IS_ERR(ctx->cryptd_tfm))
839 cryptd_free_aead(ctx->cryptd_tfm);
840 return;
841}
842
843static void
844rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
845{
846 struct aesni_gcm_set_hash_subkey_result *result = req->data;
847
848 if (err == -EINPROGRESS)
849 return;
850 result->err = err;
851 complete(&result->completion);
852}
853
854static int
855rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
856{
857 struct crypto_ablkcipher *ctr_tfm;
858 struct ablkcipher_request *req;
859 int ret = -EINVAL;
860 struct aesni_hash_subkey_req_data *req_data;
861
862 ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
863 if (IS_ERR(ctr_tfm))
864 return PTR_ERR(ctr_tfm);
865
866 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
867
868 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
869 if (ret) {
870 crypto_free_ablkcipher(ctr_tfm);
871 return ret;
872 }
873
874 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
875 if (!req) {
876 crypto_free_ablkcipher(ctr_tfm);
877 return -EINVAL;
878 }
879
880 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
881 if (!req_data) {
882 crypto_free_ablkcipher(ctr_tfm);
883 return -ENOMEM;
884 }
885 memset(req_data->iv, 0, sizeof(req_data->iv));
886
887	/* Clear the hash sub key container to zero: ciphering an all-zero */
888	/* block then yields the hash sub key H = E_K(0^128) used by GHASH. */
889 memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
890
891 init_completion(&req_data->result.completion);
892 sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
893 ablkcipher_request_set_tfm(req, ctr_tfm);
894 ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
895 CRYPTO_TFM_REQ_MAY_BACKLOG,
896 rfc4106_set_hash_subkey_done,
897 &req_data->result);
898
899 ablkcipher_request_set_crypt(req, &req_data->sg,
900 &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
901
902 ret = crypto_ablkcipher_encrypt(req);
903 if (ret == -EINPROGRESS || ret == -EBUSY) {
904 ret = wait_for_completion_interruptible
905 (&req_data->result.completion);
906 if (!ret)
907 ret = req_data->result.err;
908 }
909 ablkcipher_request_free(req);
910 kfree(req_data);
911 crypto_free_ablkcipher(ctr_tfm);
912 return ret;
913}
914
915static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
916 unsigned int key_len)
917{
918 int ret = 0;
919 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
920 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
921 u8 *new_key_mem = NULL;
922
923 if (key_len < 4) {
924 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
925 return -EINVAL;
926 }
927 /*Account for 4 byte nonce at the end.*/
928 key_len -= 4;
929 if (key_len != AES_KEYSIZE_128) {
930 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
931 return -EINVAL;
932 }
933
934 memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
935 /*This must be on a 16 byte boundary!*/
936 if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
937 return -EINVAL;
938
939 if ((unsigned long)key % AESNI_ALIGN) {
940		/* key is not aligned: use an auxiliary aligned buffer */
941 new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
942 if (!new_key_mem)
943 return -ENOMEM;
944
945 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
946 memcpy(new_key_mem, key, key_len);
947 key = new_key_mem;
948 }
949
950 if (!irq_fpu_usable())
951 ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
952 key, key_len);
953 else {
954 kernel_fpu_begin();
955 ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
956 kernel_fpu_end();
957 }
958 /*This must be on a 16 byte boundary!*/
959 if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
960 ret = -EINVAL;
961 goto exit;
962 }
963 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
964exit:
965 kfree(new_key_mem);
966 return ret;
967}
968
969/* This is the Integrity Check Value (aka the authentication tag) length; it can
970 * be 8, 12 or 16 bytes long. */
971static int rfc4106_set_authsize(struct crypto_aead *parent,
972 unsigned int authsize)
973{
974 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
975 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
976
977 switch (authsize) {
978 case 8:
979 case 12:
980 case 16:
981 break;
982 default:
983 return -EINVAL;
984 }
985 crypto_aead_crt(parent)->authsize = authsize;
986 crypto_aead_crt(cryptd_child)->authsize = authsize;
987 return 0;
988}
989
990static int rfc4106_encrypt(struct aead_request *req)
991{
992 int ret;
993 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
994 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
995 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
996
997 if (!irq_fpu_usable()) {
998 struct aead_request *cryptd_req =
999 (struct aead_request *) aead_request_ctx(req);
1000 memcpy(cryptd_req, req, sizeof(*req));
1001 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1002 return crypto_aead_encrypt(cryptd_req);
1003 } else {
1004 kernel_fpu_begin();
1005 ret = cryptd_child->base.crt_aead.encrypt(req);
1006 kernel_fpu_end();
1007 return ret;
1008 }
1009}
1010
1011static int rfc4106_decrypt(struct aead_request *req)
1012{
1013 int ret;
1014 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1015 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1016 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1017
1018 if (!irq_fpu_usable()) {
1019 struct aead_request *cryptd_req =
1020 (struct aead_request *) aead_request_ctx(req);
1021 memcpy(cryptd_req, req, sizeof(*req));
1022 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1023 return crypto_aead_decrypt(cryptd_req);
1024 } else {
1025 kernel_fpu_begin();
1026 ret = cryptd_child->base.crt_aead.decrypt(req);
1027 kernel_fpu_end();
1028 return ret;
1029 }
1030}
1031
1032static struct crypto_alg rfc4106_alg = {
1033 .cra_name = "rfc4106(gcm(aes))",
1034 .cra_driver_name = "rfc4106-gcm-aesni",
1035 .cra_priority = 400,
1036 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1037 .cra_blocksize = 1,
1038 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1039 .cra_alignmask = 0,
1040 .cra_type = &crypto_nivaead_type,
1041 .cra_module = THIS_MODULE,
1042 .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
1043 .cra_init = rfc4106_init,
1044 .cra_exit = rfc4106_exit,
1045 .cra_u = {
1046 .aead = {
1047 .setkey = rfc4106_set_key,
1048 .setauthsize = rfc4106_set_authsize,
1049 .encrypt = rfc4106_encrypt,
1050 .decrypt = rfc4106_decrypt,
1051 .geniv = "seqiv",
1052 .ivsize = 8,
1053 .maxauthsize = 16,
1054 },
1055 },
1056};
1057
1058static int __driver_rfc4106_encrypt(struct aead_request *req)
1059{
1060 u8 one_entry_in_sg = 0;
1061 u8 *src, *dst, *assoc;
1062 __be32 counter = cpu_to_be32(1);
1063 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1064 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1065 void *aes_ctx = &(ctx->aes_key_expanded);
1066 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1067 u8 iv_tab[16+AESNI_ALIGN];
1068 u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
1069 struct scatter_walk src_sg_walk;
1070 struct scatter_walk assoc_sg_walk;
1071 struct scatter_walk dst_sg_walk;
1072 unsigned int i;
1073
1074	/* Assuming we are supporting rfc4106 64-bit extended */
1075	/* sequence numbers, we need the AAD length to be */
1076	/* equal to 8 or 12 bytes. */
1077 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
1078 return -EINVAL;
1079	/* Build the IV: salt || explicit IV || 0x00000001 */
1080 for (i = 0; i < 4; i++)
1081 *(iv+i) = ctx->nonce[i];
1082 for (i = 0; i < 8; i++)
1083 *(iv+4+i) = req->iv[i];
1084 *((__be32 *)(iv+12)) = counter;
1085
1086 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1087 one_entry_in_sg = 1;
1088 scatterwalk_start(&src_sg_walk, req->src);
1089 scatterwalk_start(&assoc_sg_walk, req->assoc);
1090 src = scatterwalk_map(&src_sg_walk, 0);
1091 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1092 dst = src;
1093 if (unlikely(req->src != req->dst)) {
1094 scatterwalk_start(&dst_sg_walk, req->dst);
1095 dst = scatterwalk_map(&dst_sg_walk, 0);
1096 }
1097
1098 } else {
1099 /* Allocate memory for src, dst, assoc */
1100 src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
1101 GFP_ATOMIC);
1102 if (unlikely(!src))
1103 return -ENOMEM;
1104 assoc = (src + req->cryptlen + auth_tag_len);
1105 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1106 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1107 req->assoclen, 0);
1108 dst = src;
1109 }
1110
1111 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
1112 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
1113 + ((unsigned long)req->cryptlen), auth_tag_len);
1114
1115 /* The authTag (aka the Integrity Check Value) needs to be written
1116 * back to the packet. */
1117 if (one_entry_in_sg) {
1118 if (unlikely(req->src != req->dst)) {
1119 scatterwalk_unmap(dst, 0);
1120 scatterwalk_done(&dst_sg_walk, 0, 0);
1121 }
1122 scatterwalk_unmap(src, 0);
1123 scatterwalk_unmap(assoc, 0);
1124 scatterwalk_done(&src_sg_walk, 0, 0);
1125 scatterwalk_done(&assoc_sg_walk, 0, 0);
1126 } else {
1127 scatterwalk_map_and_copy(dst, req->dst, 0,
1128 req->cryptlen + auth_tag_len, 1);
1129 kfree(src);
1130 }
1131 return 0;
1132}
1133
1134static int __driver_rfc4106_decrypt(struct aead_request *req)
1135{
1136 u8 one_entry_in_sg = 0;
1137 u8 *src, *dst, *assoc;
1138 unsigned long tempCipherLen = 0;
1139 __be32 counter = cpu_to_be32(1);
1140 int retval = 0;
1141 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1142 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1143 void *aes_ctx = &(ctx->aes_key_expanded);
1144 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1145 u8 iv_and_authTag[32+AESNI_ALIGN];
1146 u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
1147 u8 *authTag = iv + 16;
1148 struct scatter_walk src_sg_walk;
1149 struct scatter_walk assoc_sg_walk;
1150 struct scatter_walk dst_sg_walk;
1151 unsigned int i;
1152
1153 if (unlikely((req->cryptlen < auth_tag_len) ||
1154 (req->assoclen != 8 && req->assoclen != 12)))
1155 return -EINVAL;
1156	/* Assuming we are supporting rfc4106 64-bit extended */
1157	/* sequence numbers, we need the AAD length to be */
1158	/* equal to 8 or 12 bytes. */
1159
1160 tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
1161	/* Build the IV: salt || explicit IV || 0x00000001 */
1162 for (i = 0; i < 4; i++)
1163 *(iv+i) = ctx->nonce[i];
1164 for (i = 0; i < 8; i++)
1165 *(iv+4+i) = req->iv[i];
1166 *((__be32 *)(iv+12)) = counter;
1167
1168 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1169 one_entry_in_sg = 1;
1170 scatterwalk_start(&src_sg_walk, req->src);
1171 scatterwalk_start(&assoc_sg_walk, req->assoc);
1172 src = scatterwalk_map(&src_sg_walk, 0);
1173 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1174 dst = src;
1175 if (unlikely(req->src != req->dst)) {
1176 scatterwalk_start(&dst_sg_walk, req->dst);
1177 dst = scatterwalk_map(&dst_sg_walk, 0);
1178 }
1179
1180 } else {
1181 /* Allocate memory for src, dst, assoc */
1182 src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
1183 if (!src)
1184 return -ENOMEM;
1185 assoc = (src + req->cryptlen + auth_tag_len);
1186 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1187 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1188 req->assoclen, 0);
1189 dst = src;
1190 }
1191
1192 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
1193 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1194 authTag, auth_tag_len);
1195
1196 /* Compare generated tag with passed in tag. */
1197 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
1198 -EBADMSG : 0;
1199
1200 if (one_entry_in_sg) {
1201 if (unlikely(req->src != req->dst)) {
1202 scatterwalk_unmap(dst, 0);
1203 scatterwalk_done(&dst_sg_walk, 0, 0);
1204 }
1205 scatterwalk_unmap(src, 0);
1206 scatterwalk_unmap(assoc, 0);
1207 scatterwalk_done(&src_sg_walk, 0, 0);
1208 scatterwalk_done(&assoc_sg_walk, 0, 0);
1209 } else {
1210 scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
1211 kfree(src);
1212 }
1213 return retval;
1214}
1215
1216static struct crypto_alg __rfc4106_alg = {
1217 .cra_name = "__gcm-aes-aesni",
1218 .cra_driver_name = "__driver-gcm-aes-aesni",
1219 .cra_priority = 0,
1220 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
1221 .cra_blocksize = 1,
1222 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1223 .cra_alignmask = 0,
1224 .cra_type = &crypto_aead_type,
1225 .cra_module = THIS_MODULE,
1226 .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
1227 .cra_u = {
1228 .aead = {
1229 .encrypt = __driver_rfc4106_encrypt,
1230 .decrypt = __driver_rfc4106_decrypt,
1231 },
1232 },
1233};
1234
733static int __init aesni_init(void) 1235static int __init aesni_init(void)
734{ 1236{
735 int err; 1237 int err;
@@ -738,6 +1240,7 @@ static int __init aesni_init(void)
738 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); 1240 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
739 return -ENODEV; 1241 return -ENODEV;
740 } 1242 }
1243
741 if ((err = crypto_register_alg(&aesni_alg))) 1244 if ((err = crypto_register_alg(&aesni_alg)))
742 goto aes_err; 1245 goto aes_err;
743 if ((err = crypto_register_alg(&__aesni_alg))) 1246 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -770,10 +1273,19 @@ static int __init aesni_init(void)
770 if ((err = crypto_register_alg(&ablk_xts_alg))) 1273 if ((err = crypto_register_alg(&ablk_xts_alg)))
771 goto ablk_xts_err; 1274 goto ablk_xts_err;
772#endif 1275#endif
773 1276 err = crypto_register_alg(&__rfc4106_alg);
1277 if (err)
1278 goto __aead_gcm_err;
1279 err = crypto_register_alg(&rfc4106_alg);
1280 if (err)
1281 goto aead_gcm_err;
774 return err; 1282 return err;
775 1283
1284aead_gcm_err:
1285 crypto_unregister_alg(&__rfc4106_alg);
1286__aead_gcm_err:
776#ifdef HAS_XTS 1287#ifdef HAS_XTS
1288 crypto_unregister_alg(&ablk_xts_alg);
777ablk_xts_err: 1289ablk_xts_err:
778#endif 1290#endif
779#ifdef HAS_PCBC 1291#ifdef HAS_PCBC
@@ -809,6 +1321,8 @@ aes_err:
809 1321
810static void __exit aesni_exit(void) 1322static void __exit aesni_exit(void)
811{ 1323{
1324 crypto_unregister_alg(&__rfc4106_alg);
1325 crypto_unregister_alg(&rfc4106_alg);
812#ifdef HAS_XTS 1326#ifdef HAS_XTS
813 crypto_unregister_alg(&ablk_xts_alg); 1327 crypto_unregister_alg(&ablk_xts_alg);
814#endif 1328#endif