Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--  arch/x86/crypto/Makefile                    |    4
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S           | 1835
-rw-r--r--  arch/x86/crypto/aesni-intel_glue.c          |  557
-rw-r--r--  arch/x86/crypto/fpu.c                       |   10
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c  |    1
5 files changed, 2359 insertions(+), 48 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad89fdf7..c04f1b7a9139 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_FPU) += fpu.o
6
7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
9obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
24twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 22twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 23salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
26 24
27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 25aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
28 26
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 27ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..be6d9e365a80 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
9 * Vinodh Gopal <vinodh.gopal@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir 10 * Kahraman Akdemir
11 * 11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
12 * This program is free software; you can redistribute it and/or modify 26 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 27 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or 28 * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
18#include <linux/linkage.h> 32#include <linux/linkage.h>
19#include <asm/inst.h> 33#include <asm/inst.h>
20 34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
21.text 56.text
22 57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
88#endif
89
90
23#define STATE1 %xmm0 91#define STATE1 %xmm0
24#define STATE2 %xmm4 92#define STATE2 %xmm4
25#define STATE3 %xmm5 93#define STATE3 %xmm5
@@ -32,12 +100,16 @@
32#define IN IN1 100#define IN IN1
33#define KEY %xmm2 101#define KEY %xmm2
34#define IV %xmm3 102#define IV %xmm3
103
35#define BSWAP_MASK %xmm10 104#define BSWAP_MASK %xmm10
36#define CTR %xmm11 105#define CTR %xmm11
37#define INC %xmm12 106#define INC %xmm12
38 107
108#ifdef __x86_64__
109#define AREG %rax
39#define KEYP %rdi 110#define KEYP %rdi
40#define OUTP %rsi 111#define OUTP %rsi
112#define UKEYP OUTP
41#define INP %rdx 113#define INP %rdx
42#define LEN %rcx 114#define LEN %rcx
43#define IVP %r8 115#define IVP %r8
@@ -46,6 +118,1591 @@
46#define TKEYP T1 118#define TKEYP T1
47#define T2 %r11 119#define T2 %r11
48#define TCTR_LOW T2 120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154 pxor \GH, \TMP2
155	pxor	\TMP1, \TMP2		# TMP2 = a1*b0 + a0*b1 (middle term)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
160	pxor	\TMP2, \TMP1		# TMP1:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
166 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167					# in order to perform
168 # independent shifts
169	pslld	$31, \TMP2		# packed left shift <<31
170	pslld	$30, \TMP3		# packed left shift <<30
171	pslld	$25, \TMP4		# packed left shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
181 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182					# in order to perform
183 # independent shifts
184 movdqa \GH,\TMP3
185 movdqa \GH,\TMP4
186	psrld	$1,\TMP2		# packed right shift >>1
187	psrld	$2,\TMP3		# packed right shift >>2
188	psrld	$7,\TMP4		# packed right shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
193	pxor	\TMP1, \GH		# result is in GH
194.endm
195
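For reference, a minimal standalone C sketch (not part of this patch) of the
Karatsuba split that GHASH_MUL performs: clmul64() stands in for one
PCLMULQDQ, the three products mirror the three PCLMULQDQ issued above, and
the middle term is folded in with the same 8-byte shifts (pslldq/psrldq $8).
The reduction modulo x^128 + x^127 + x^126 + x^121 + 1 that GHASH_MUL then
performs is omitted here.

	#include <stdint.h>

	struct u128 { uint64_t hi, lo; };	/* 128-bit value as hi:lo */

	/* 64x64 -> 128-bit carry-less multiply, the software equivalent
	 * of a single PCLMULQDQ (schoolbook, for illustration only). */
	static struct u128 clmul64(uint64_t a, uint64_t b)
	{
		struct u128 r = { 0, 0 };
		int i;

		for (i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				r.lo ^= a << i;
				if (i)
					r.hi ^= a >> (64 - i);
			}
		}
		return r;
	}

	/* 128x128 -> 256-bit carry-less multiply via Karatsuba:
	 * result = a1*b1*x^128 + (middle term)*x^64 + a0*b0, where the
	 * middle term is (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0. */
	static void clmul128(struct u128 a, struct u128 b,
			     struct u128 *hi, struct u128 *lo)
	{
		struct u128 h = clmul64(a.hi, b.hi);	/* a1*b1 */
		struct u128 l = clmul64(a.lo, b.lo);	/* a0*b0 */
		struct u128 m = clmul64(a.hi ^ a.lo, b.hi ^ b.lo);

		m.hi ^= h.hi ^ l.hi;	/* m = a1*b0 ^ a0*b1 */
		m.lo ^= h.lo ^ l.lo;
		*lo = l;
		*hi = h;
		lo->hi ^= m.lo;		/* middle term shifted up 64 bits ... */
		hi->lo ^= m.hi;		/* ... straddles the two result halves */
	}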
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
205*/
206
207
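For example, a 100-byte message has b = floor(100/16) = 6 complete blocks, so
num_initial_blocks = 6 mod 4 = 2: two blocks are consumed here, leaving the
remaining complete blocks as a multiple of four for the parallel loop below.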
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
264	AESENC	   \TMP1, %xmm\index	      # Round 3
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
268	AESENC	   \TMP1, %xmm\index	      # Round 4
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
272	AESENC	   \TMP1, %xmm\index	      # Round 5
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
276	AESENC	   \TMP1, %xmm\index	      # Round 6
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
280	AESENC	   \TMP1, %xmm\index	      # Round 7
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
284	AESENC	   \TMP1, %xmm\index	      # Round 8
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
288	AESENC	   \TMP1, %xmm\index	      # Round 9
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
333* HashKey_i_k holds XORed values of the low and high parts of HashKey^i
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391# TMP5 = HashKey^4<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
442* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
502	AESENC	   \TMP1, %xmm\index	      # Round 3
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
506	AESENC	   \TMP1, %xmm\index	      # Round 4
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
510	AESENC	   \TMP1, %xmm\index	      # Round 5
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
514	AESENC	   \TMP1, %xmm\index	      # Round 6
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
518	AESENC	   \TMP1, %xmm\index	      # Round 7
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
522	AESENC	   \TMP1, %xmm\index	      # Round 8
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
526	AESENC	   \TMP1, %xmm\index	      # Round 9
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
570* HashKey_i_k holds XORed values of the low and high parts of HashKey^i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832	pslld	$31, \TMP2		# packed left shift << 31
833	pslld	$30, \TMP3		# packed left shift << 30
834	pslld	$25, \TMP4		# packed left shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847	psrld	$1, \TMP2		# packed right shift >>1
848	psrld	$2, \TMP3		# packed right shift >>2
849	psrld	$7, \TMP4		# packed right shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
854	pxor	\TMP1, \XMM5		# result is in XMM5
855
856 pxor \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028	pslld	$31, \TMP2		# packed left shift << 31
1029	pslld	$30, \TMP3		# packed left shift << 30
1030	pslld	$25, \TMP4		# packed left shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
1043	psrld	$1, \TMP2		# packed right shift >>1
1044	psrld	$2, \TMP3		# packed right shift >>2
1045	psrld	$7, \TMP4		# packed right shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
1050	pxor	\TMP1, \XMM5		# result is in XMM5
1051
1052 pxor \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059 # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072 # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087 # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101 # Multiply TMP1 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127	pslld	$31, \TMP2		# packed left shifting << 31
1128	pslld	$30, \TMP3		# packed left shifting << 30
1129	pslld	$25, \TMP4		# packed left shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
1142	psrld	$1, \TMP2		# packed right shift >> 1
1143	psrld	$2, \TMP3		# packed right shift >> 2
1144	psrld	$7, \TMP4		# packed right shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
1151
1152/* Encryption of a single block */
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code supports 16 too but for other sizes, the code will fail.
1257*
1258* TLen:
1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
1265
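For context, a hedged C-level restatement of this entry point (the real
declaration belongs to aesni-intel_glue.c elsewhere in this patch, where the
caller also derives hash_subkey = E(K, 0^128)); standard integer types are
substituted for the kernel's u8/u64 so the prototype stands alone:

	#include <stdint.h>

	void aesni_gcm_dec(void *aes_ctx,          /* expanded AES-128 key schedule */
			   uint8_t *out,           /* plaintext output (in-place allowed) */
			   const uint8_t *in,      /* ciphertext input */
			   uint64_t plaintext_len, /* length in bytes */
			   uint8_t *iv,            /* pre-counter block j0 */
			   uint8_t *hash_subkey,   /* H = E(K, 0^128) */
			   const uint8_t *aad,     /* additional authenticated data */
			   uint64_t aad_len,       /* 8 or 12 bytes for RFC4106 */
			   uint8_t *auth_tag,      /* computed tag, compared by the caller */
			   uint64_t auth_tag_len); /* 16, 12 or 8 */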
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
1299
1300
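A hedged standalone C sketch of the precomputation just performed: shift the
byte-reflected hash subkey left by one bit and, if a bit fell off the top,
reduce with poly = x^128 + x^127 + x^126 + x^121 + 1 (the POLY constant
0xC2000000000000000000000000000001):

	#include <stdint.h>

	static void hashkey_shl1_mod_poly(uint64_t *hi, uint64_t *lo)
	{
		int carry = (int)(*hi >> 63);	/* bit 127 before the shift */

		*hi = (*hi << 1) | (*lo >> 63);
		*lo <<= 1;
		if (carry) {			/* conditional reduction, done above
						 * with pshufd/pcmpeqd/pand/pxor */
			*hi ^= 0xC200000000000000ULL;
			*lo ^= 0x0000000000000001ULL;
		}
	}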
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
1349 # Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1364	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
1396_multiple_of_16_bytes_decrypt:
1397	mov	arg8, %r12		  # %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
1400	shl	$3, %arg4		  # len(C) in bits (*8)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
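The length block just folded into GHASH is simply len(A) and len(C) expressed
in bits (shl $3 multiplies by 8), with len(A) in the upper 64 bits and len(C)
in the lower 64 bits. A hedged C equivalent:

	#include <stdint.h>

	static void gcm_length_block(uint64_t aad_len_bytes, uint64_t text_len_bytes,
				     uint64_t *hi, uint64_t *lo)
	{
		*hi = aad_len_bytes << 3;	/* len(A) in bits */
		*lo = text_len_bytes << 3;	/* len(C) in bits */
	}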
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* keys are pre-expanded and aligned to 16 bytes. we are using the
1461* first set of 11 keys in the data structure void *aes_ctx
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[3] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code supports 16 too but for other sizes, the code will fail.
1518*
1519* TLen:
1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
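A hedged C sketch of how the pre-counter block j0 described above is laid out
for RFC4106 (4-byte salt from the SA, 8-byte explicit IV from the ESP payload,
then a big-endian 32-bit counter starting at 1); the helper name is
illustrative and not part of this patch:

	#include <stdint.h>
	#include <string.h>

	static void rfc4106_build_j0(const uint8_t salt[4], const uint8_t iv[8],
				     uint8_t j0[16])
	{
		memcpy(j0, salt, 4);	/* salt from the Security Association */
		memcpy(j0 + 4, iv, 8);	/* explicit IV from the ESP payload */
		j0[12] = 0;		/* 32-bit counter, big-endian ... */
		j0[13] = 0;
		j0[14] = 0;
		j0[15] = 1;		/* ... initialised to 0x00000001 */
	}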
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557 pxor %xmm2, %xmm13
1558 movdqa %xmm13, HashKey(%rsp)
1559 mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
1560 and $-16, %r13
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1617 sub $16, %r11
1618 add %r13, %r11
1619 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
1620 lea SHIFT_MASK+16(%rip), %r12
1621 sub %r13, %r12
1622 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623 # (%r13 is the number of bytes in plaintext mod 16)
1624 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1625	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-r13 bytes
1626 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1627 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1628 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10,%xmm0
1632
1633 pxor %xmm0, %xmm8
1634 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635 # GHASH computation for the last <16 byte block
1636 sub %r13, %r11
1637 add $16, %r11
1638
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1641
1642 # shuffle xmm0 back to output as ciphertext
1643
1644 # Output %r13 bytes
1645 MOVQ_R64_XMM %xmm0, %rax
1646 cmp $8, %r13
1647 jle _less_than_8_bytes_left_encrypt
1648 mov %rax, (%arg2 , %r11, 1)
1649 add $8, %r11
1650 psrldq $8, %xmm0
1651 MOVQ_R64_XMM %xmm0, %rax
1652 sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654 mov %al, (%arg2, %r11, 1)
1655 add $1, %r11
1656 shr $8, %rax
1657 sub $1, %r13
1658 jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
1660	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
1661 shl $3, %r12
1662 movd %r12d, %xmm15 # len(A) in %xmm15
1663	shl	$3, %arg4		# len(C) in bits (*8)
1664 MOVQ_R64_XMM %arg4, %xmm1
1665 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1666 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1667 pxor %xmm15, %xmm8
1668 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669 # final GHASH computation
1670 movdqa SHUF_MASK(%rip), %xmm10
1671 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1672
1673 mov %arg5, %rax # %rax = *Y0
1674 movdqu (%rax), %xmm0 # %xmm0 = Y0
1675 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1676 pxor %xmm8, %xmm0
1677_return_T_encrypt:
1678 mov arg9, %r10 # %r10 = authTag
1679 mov arg10, %r11 # %r11 = auth_tag_len
1680 cmp $16, %r11
1681 je _T_16_encrypt
1682 cmp $12, %r11
1683 je _T_12_encrypt
1684_T_8_encrypt:
1685 MOVQ_R64_XMM %xmm0, %rax
1686 mov %rax, (%r10)
1687 jmp _return_T_done_encrypt
1688_T_12_encrypt:
1689 MOVQ_R64_XMM %xmm0, %rax
1690 mov %rax, (%r10)
1691 psrldq $8, %xmm0
1692 movd %xmm0, %eax
1693 mov %eax, 8(%r10)
1694 jmp _return_T_done_encrypt
1695_T_16_encrypt:
1696 movdqu %xmm0, (%r10)
1697_return_T_done_encrypt:
1698 mov %r14, %rsp
1699 pop %r14
1700 pop %r13
1701 pop %r12
1702 ret
1703
1704#endif
1705
49 1706
50_key_expansion_128: 1707_key_expansion_128:
51_key_expansion_256a: 1708_key_expansion_256a:
@@ -55,10 +1712,11 @@ _key_expansion_256a:
55 shufps $0b10001100, %xmm0, %xmm4 1712 shufps $0b10001100, %xmm0, %xmm4
56 pxor %xmm4, %xmm0 1713 pxor %xmm4, %xmm0
57 pxor %xmm1, %xmm0 1714 pxor %xmm1, %xmm0
58 movaps %xmm0, (%rcx) 1715 movaps %xmm0, (TKEYP)
59 add $0x10, %rcx 1716 add $0x10, TKEYP
60 ret 1717 ret
61 1718
1719.align 4
62_key_expansion_192a: 1720_key_expansion_192a:
63 pshufd $0b01010101, %xmm1, %xmm1 1721 pshufd $0b01010101, %xmm1, %xmm1
64 shufps $0b00010000, %xmm0, %xmm4 1722 shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1734,13 @@ _key_expansion_192a:
76 1734
77 movaps %xmm0, %xmm1 1735 movaps %xmm0, %xmm1
78 shufps $0b01000100, %xmm0, %xmm6 1736 shufps $0b01000100, %xmm0, %xmm6
79 movaps %xmm6, (%rcx) 1737 movaps %xmm6, (TKEYP)
80 shufps $0b01001110, %xmm2, %xmm1 1738 shufps $0b01001110, %xmm2, %xmm1
81 movaps %xmm1, 16(%rcx) 1739 movaps %xmm1, 0x10(TKEYP)
82 add $0x20, %rcx 1740 add $0x20, TKEYP
83 ret 1741 ret
84 1742
1743.align 4
85_key_expansion_192b: 1744_key_expansion_192b:
86 pshufd $0b01010101, %xmm1, %xmm1 1745 pshufd $0b01010101, %xmm1, %xmm1
87 shufps $0b00010000, %xmm0, %xmm4 1746 shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1755,11 @@ _key_expansion_192b:
96 pxor %xmm3, %xmm2 1755 pxor %xmm3, %xmm2
97 pxor %xmm5, %xmm2 1756 pxor %xmm5, %xmm2
98 1757
99 movaps %xmm0, (%rcx) 1758 movaps %xmm0, (TKEYP)
100 add $0x10, %rcx 1759 add $0x10, TKEYP
101 ret 1760 ret
102 1761
1762.align 4
103_key_expansion_256b: 1763_key_expansion_256b:
104 pshufd $0b10101010, %xmm1, %xmm1 1764 pshufd $0b10101010, %xmm1, %xmm1
105 shufps $0b00010000, %xmm2, %xmm4 1765 shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1767,8 @@ _key_expansion_256b:
107 shufps $0b10001100, %xmm2, %xmm4 1767 shufps $0b10001100, %xmm2, %xmm4
108 pxor %xmm4, %xmm2 1768 pxor %xmm4, %xmm2
109 pxor %xmm1, %xmm2 1769 pxor %xmm1, %xmm2
110 movaps %xmm2, (%rcx) 1770 movaps %xmm2, (TKEYP)
111 add $0x10, %rcx 1771 add $0x10, TKEYP
112 ret 1772 ret
113 1773
114/* 1774/*
@@ -116,17 +1776,23 @@ _key_expansion_256b:
116 * unsigned int key_len) 1776 * unsigned int key_len)
117 */ 1777 */
118ENTRY(aesni_set_key) 1778ENTRY(aesni_set_key)
119 movups (%rsi), %xmm0 # user key (first 16 bytes) 1779#ifndef __x86_64__
120 movaps %xmm0, (%rdi) 1780 pushl KEYP
121 lea 0x10(%rdi), %rcx # key addr 1781 movl 8(%esp), KEYP # ctx
122 movl %edx, 480(%rdi) 1782 movl 12(%esp), UKEYP # in_key
1783 movl 16(%esp), %edx # key_len
1784#endif
1785 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1786 movaps %xmm0, (KEYP)
1787 lea 0x10(KEYP), TKEYP # key addr
1788 movl %edx, 480(KEYP)
123 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1789 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
124 cmp $24, %dl 1790 cmp $24, %dl
125 jb .Lenc_key128 1791 jb .Lenc_key128
126 je .Lenc_key192 1792 je .Lenc_key192
127 movups 0x10(%rsi), %xmm2 # other user key 1793 movups 0x10(UKEYP), %xmm2 # other user key
128 movaps %xmm2, (%rcx) 1794 movaps %xmm2, (TKEYP)
129 add $0x10, %rcx 1795 add $0x10, TKEYP
130 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1796 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
131 call _key_expansion_256a 1797 call _key_expansion_256a
132 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1798 AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1821,7 @@ ENTRY(aesni_set_key)
155 call _key_expansion_256a 1821 call _key_expansion_256a
156 jmp .Ldec_key 1822 jmp .Ldec_key
157.Lenc_key192: 1823.Lenc_key192:
158 movq 0x10(%rsi), %xmm2 # other user key 1824 movq 0x10(UKEYP), %xmm2 # other user key
159 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1825 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
160 call _key_expansion_192a 1826 call _key_expansion_192a
161 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1827 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +1861,47 @@ ENTRY(aesni_set_key)
195 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1861 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
196 call _key_expansion_128 1862 call _key_expansion_128
197.Ldec_key: 1863.Ldec_key:
198 sub $0x10, %rcx 1864 sub $0x10, TKEYP
199 movaps (%rdi), %xmm0 1865 movaps (KEYP), %xmm0
200 movaps (%rcx), %xmm1 1866 movaps (TKEYP), %xmm1
201 movaps %xmm0, 240(%rcx) 1867 movaps %xmm0, 240(TKEYP)
202 movaps %xmm1, 240(%rdi) 1868 movaps %xmm1, 240(KEYP)
203 add $0x10, %rdi 1869 add $0x10, KEYP
204 lea 240-16(%rcx), %rsi 1870 lea 240-16(TKEYP), UKEYP
205.align 4 1871.align 4
206.Ldec_key_loop: 1872.Ldec_key_loop:
207 movaps (%rdi), %xmm0 1873 movaps (KEYP), %xmm0
208 AESIMC %xmm0 %xmm1 1874 AESIMC %xmm0 %xmm1
209 movaps %xmm1, (%rsi) 1875 movaps %xmm1, (UKEYP)
210 add $0x10, %rdi 1876 add $0x10, KEYP
211 sub $0x10, %rsi 1877 sub $0x10, UKEYP
212 cmp %rcx, %rdi 1878 cmp TKEYP, KEYP
213 jb .Ldec_key_loop 1879 jb .Ldec_key_loop
214 xor %rax, %rax 1880 xor AREG, AREG
1881#ifndef __x86_64__
1882 popl KEYP
1883#endif
215 ret 1884 ret
216 1885
217/* 1886/*
218 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
219 */ 1888 */
220ENTRY(aesni_enc) 1889ENTRY(aesni_enc)
1890#ifndef __x86_64__
1891 pushl KEYP
1892 pushl KLEN
1893 movl 12(%esp), KEYP
1894 movl 16(%esp), OUTP
1895 movl 20(%esp), INP
1896#endif
221 movl 480(KEYP), KLEN # key length 1897 movl 480(KEYP), KLEN # key length
222 movups (INP), STATE # input 1898 movups (INP), STATE # input
223 call _aesni_enc1 1899 call _aesni_enc1
224 movups STATE, (OUTP) # output 1900 movups STATE, (OUTP) # output
1901#ifndef __x86_64__
1902 popl KLEN
1903 popl KEYP
1904#endif
225 ret 1905 ret
226 1906
227/* 1907/*
@@ -236,6 +1916,7 @@ ENTRY(aesni_enc)
236 * KEY 1916 * KEY
237 * TKEYP (T1) 1917 * TKEYP (T1)
238 */ 1918 */
1919.align 4
239_aesni_enc1: 1920_aesni_enc1:
240 movaps (KEYP), KEY # key 1921 movaps (KEYP), KEY # key
241 mov KEYP, TKEYP 1922 mov KEYP, TKEYP
@@ -298,6 +1979,7 @@ _aesni_enc1:
298 * KEY 1979 * KEY
299 * TKEYP (T1) 1980 * TKEYP (T1)
300 */ 1981 */
1982.align 4
301_aesni_enc4: 1983_aesni_enc4:
302 movaps (KEYP), KEY # key 1984 movaps (KEYP), KEY # key
303 mov KEYP, TKEYP 1985 mov KEYP, TKEYP
@@ -391,11 +2073,22 @@ _aesni_enc4:
391 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2073 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
392 */ 2074 */
393ENTRY(aesni_dec) 2075ENTRY(aesni_dec)
2076#ifndef __x86_64__
2077 pushl KEYP
2078 pushl KLEN
2079 movl 12(%esp), KEYP
2080 movl 16(%esp), OUTP
2081 movl 20(%esp), INP
2082#endif
394 mov 480(KEYP), KLEN # key length 2083 mov 480(KEYP), KLEN # key length
395 add $240, KEYP 2084 add $240, KEYP
396 movups (INP), STATE # input 2085 movups (INP), STATE # input
397 call _aesni_dec1 2086 call _aesni_dec1
398 movups STATE, (OUTP) #output 2087 movups STATE, (OUTP) #output
2088#ifndef __x86_64__
2089 popl KLEN
2090 popl KEYP
2091#endif
399 ret 2092 ret
400 2093
401/* 2094/*
@@ -410,6 +2103,7 @@ ENTRY(aesni_dec)
410 * KEY 2103 * KEY
411 * TKEYP (T1) 2104 * TKEYP (T1)
412 */ 2105 */
2106.align 4
413_aesni_dec1: 2107_aesni_dec1:
414 movaps (KEYP), KEY # key 2108 movaps (KEYP), KEY # key
415 mov KEYP, TKEYP 2109 mov KEYP, TKEYP
@@ -472,6 +2166,7 @@ _aesni_dec1:
472 * KEY 2166 * KEY
473 * TKEYP (T1) 2167 * TKEYP (T1)
474 */ 2168 */
2169.align 4
475_aesni_dec4: 2170_aesni_dec4:
476 movaps (KEYP), KEY # key 2171 movaps (KEYP), KEY # key
477 mov KEYP, TKEYP 2172 mov KEYP, TKEYP
@@ -566,6 +2261,15 @@ _aesni_dec4:
566 * size_t len) 2261 * size_t len)
567 */ 2262 */
568ENTRY(aesni_ecb_enc) 2263ENTRY(aesni_ecb_enc)
2264#ifndef __x86_64__
2265 pushl LEN
2266 pushl KEYP
2267 pushl KLEN
2268 movl 16(%esp), KEYP
2269 movl 20(%esp), OUTP
2270 movl 24(%esp), INP
2271 movl 28(%esp), LEN
2272#endif
569 test LEN, LEN # check length 2273 test LEN, LEN # check length
570 jz .Lecb_enc_ret 2274 jz .Lecb_enc_ret
571 mov 480(KEYP), KLEN 2275 mov 480(KEYP), KLEN
@@ -602,6 +2306,11 @@ ENTRY(aesni_ecb_enc)
602 cmp $16, LEN 2306 cmp $16, LEN
603 jge .Lecb_enc_loop1 2307 jge .Lecb_enc_loop1
604.Lecb_enc_ret: 2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310 popl KLEN
2311 popl KEYP
2312 popl LEN
2313#endif
605 ret 2314 ret
606 2315
607/* 2316/*
@@ -609,6 +2318,15 @@ ENTRY(aesni_ecb_enc)
609 * size_t len); 2318 * size_t len);
610 */ 2319 */
611ENTRY(aesni_ecb_dec) 2320ENTRY(aesni_ecb_dec)
2321#ifndef __x86_64__
2322 pushl LEN
2323 pushl KEYP
2324 pushl KLEN
2325 movl 16(%esp), KEYP
2326 movl 20(%esp), OUTP
2327 movl 24(%esp), INP
2328 movl 28(%esp), LEN
2329#endif
612 test LEN, LEN 2330 test LEN, LEN
613 jz .Lecb_dec_ret 2331 jz .Lecb_dec_ret
614 mov 480(KEYP), KLEN 2332 mov 480(KEYP), KLEN
@@ -646,6 +2364,11 @@ ENTRY(aesni_ecb_dec)
646 cmp $16, LEN 2364 cmp $16, LEN
647 jge .Lecb_dec_loop1 2365 jge .Lecb_dec_loop1
648.Lecb_dec_ret: 2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368 popl KLEN
2369 popl KEYP
2370 popl LEN
2371#endif
649 ret 2372 ret
650 2373
651/* 2374/*
@@ -653,6 +2376,17 @@ ENTRY(aesni_ecb_dec)
653 * size_t len, u8 *iv) 2376 * size_t len, u8 *iv)
654 */ 2377 */
655ENTRY(aesni_cbc_enc) 2378ENTRY(aesni_cbc_enc)
2379#ifndef __x86_64__
2380 pushl IVP
2381 pushl LEN
2382 pushl KEYP
2383 pushl KLEN
2384 movl 20(%esp), KEYP
2385 movl 24(%esp), OUTP
2386 movl 28(%esp), INP
2387 movl 32(%esp), LEN
2388 movl 36(%esp), IVP
2389#endif
656 cmp $16, LEN 2390 cmp $16, LEN
657 jb .Lcbc_enc_ret 2391 jb .Lcbc_enc_ret
658 mov 480(KEYP), KLEN 2392 mov 480(KEYP), KLEN
@@ -670,6 +2404,12 @@ ENTRY(aesni_cbc_enc)
670 jge .Lcbc_enc_loop 2404 jge .Lcbc_enc_loop
671 movups STATE, (IVP) 2405 movups STATE, (IVP)
672.Lcbc_enc_ret: 2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408 popl KLEN
2409 popl KEYP
2410 popl LEN
2411 popl IVP
2412#endif
673 ret 2413 ret
674 2414
675/* 2415/*
@@ -677,6 +2417,17 @@ ENTRY(aesni_cbc_enc)
677 * size_t len, u8 *iv) 2417 * size_t len, u8 *iv)
678 */ 2418 */
679ENTRY(aesni_cbc_dec) 2419ENTRY(aesni_cbc_dec)
2420#ifndef __x86_64__
2421 pushl IVP
2422 pushl LEN
2423 pushl KEYP
2424 pushl KLEN
2425 movl 20(%esp), KEYP
2426 movl 24(%esp), OUTP
2427 movl 28(%esp), INP
2428 movl 32(%esp), LEN
2429 movl 36(%esp), IVP
2430#endif
680 cmp $16, LEN 2431 cmp $16, LEN
681 jb .Lcbc_dec_just_ret 2432 jb .Lcbc_dec_just_ret
682 mov 480(KEYP), KLEN 2433 mov 480(KEYP), KLEN
@@ -690,16 +2441,30 @@ ENTRY(aesni_cbc_dec)
690 movaps IN1, STATE1 2441 movaps IN1, STATE1
691 movups 0x10(INP), IN2 2442 movups 0x10(INP), IN2
692 movaps IN2, STATE2 2443 movaps IN2, STATE2
2444#ifdef __x86_64__
693 movups 0x20(INP), IN3 2445 movups 0x20(INP), IN3
694 movaps IN3, STATE3 2446 movaps IN3, STATE3
695 movups 0x30(INP), IN4 2447 movups 0x30(INP), IN4
696 movaps IN4, STATE4 2448 movaps IN4, STATE4
2449#else
2450 movups 0x20(INP), IN1
2451 movaps IN1, STATE3
2452 movups 0x30(INP), IN2
2453 movaps IN2, STATE4
2454#endif
697 call _aesni_dec4 2455 call _aesni_dec4
698 pxor IV, STATE1 2456 pxor IV, STATE1
2457#ifdef __x86_64__
699 pxor IN1, STATE2 2458 pxor IN1, STATE2
700 pxor IN2, STATE3 2459 pxor IN2, STATE3
701 pxor IN3, STATE4 2460 pxor IN3, STATE4
702 movaps IN4, IV 2461 movaps IN4, IV
2462#else
2463 pxor (INP), STATE2
2464 pxor 0x10(INP), STATE3
2465 pxor IN1, STATE4
2466 movaps IN2, IV
2467#endif
703 movups STATE1, (OUTP) 2468 movups STATE1, (OUTP)
704 movups STATE2, 0x10(OUTP) 2469 movups STATE2, 0x10(OUTP)
705 movups STATE3, 0x20(OUTP) 2470 movups STATE3, 0x20(OUTP)
@@ -727,8 +2492,15 @@ ENTRY(aesni_cbc_dec)
727.Lcbc_dec_ret: 2492.Lcbc_dec_ret:
728 movups IV, (IVP) 2493 movups IV, (IVP)
729.Lcbc_dec_just_ret: 2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496 popl KLEN
2497 popl KEYP
2498 popl LEN
2499 popl IVP
2500#endif
730 ret 2501 ret
731 2502
2503#ifdef __x86_64__
732.align 16 2504.align 16
733.Lbswap_mask: 2505.Lbswap_mask:
734 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2506 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2516,7 @@ ENTRY(aesni_cbc_dec)
744 * INC: == 1, in little endian 2516 * INC: == 1, in little endian
745 * BSWAP_MASK == endian swapping mask 2517 * BSWAP_MASK == endian swapping mask
746 */ 2518 */
2519.align 4
747_aesni_inc_init: 2520_aesni_inc_init:
748 movaps .Lbswap_mask, BSWAP_MASK 2521 movaps .Lbswap_mask, BSWAP_MASK
749 movaps IV, CTR 2522 movaps IV, CTR
@@ -768,6 +2541,7 @@ _aesni_inc_init:
768 * CTR: == output IV, in little endian 2541 * CTR: == output IV, in little endian
769 * TCTR_LOW: == lower qword of CTR 2542 * TCTR_LOW: == lower qword of CTR
770 */ 2543 */
2544.align 4
771_aesni_inc: 2545_aesni_inc:
772 paddq INC, CTR 2546 paddq INC, CTR
773 add $1, TCTR_LOW 2547 add $1, TCTR_LOW
@@ -839,3 +2613,4 @@ ENTRY(aesni_ctr_enc)
839 movups IV, (IVP) 2613 movups IV, (IVP)
840.Lctr_enc_just_ret: 2614.Lctr_enc_just_ret:
841 ret 2615 ret
2616#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..feee8ff1d05e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
5 * Copyright (C) 2008, Intel Corp. 5 * Copyright (C) 2008, Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com> 6 * Author: Huang Ying <ying.huang@intel.com>
7 * 7 *
8 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
9 * interface for 64-bit kernels.
10 * Authors: Adrian Hoban <adrian.hoban@intel.com>
11 * Gabriele Paoloni <gabriele.paoloni@intel.com>
12 * Tadeusz Struk (tadeusz.struk@intel.com)
13 * Aidan O'Mahony (aidan.o.mahony@intel.com)
14 * Copyright (c) 2010, Intel Corporation.
15 *
8 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
21#include <crypto/ctr.h> 29#include <crypto/ctr.h>
22#include <asm/i387.h> 30#include <asm/i387.h>
23#include <asm/aes.h> 31#include <asm/aes.h>
32#include <crypto/scatterwalk.h>
33#include <crypto/internal/aead.h>
34#include <linux/workqueue.h>
35#include <linux/spinlock.h>
24 36
25#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) 37#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
26#define HAS_CTR 38#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
42 struct cryptd_ablkcipher *cryptd_tfm; 54 struct cryptd_ablkcipher *cryptd_tfm;
43}; 55};
44 56
45#define AESNI_ALIGN 16 57/* This data is stored at the end of the crypto_tfm struct.
 58 * It is a per-"session" data storage location and
 59 * needs to be 16 byte aligned.
60 */
61struct aesni_rfc4106_gcm_ctx {
62 u8 hash_subkey[16];
63 struct crypto_aes_ctx aes_key_expanded;
64 u8 nonce[4];
65 struct cryptd_aead *cryptd_tfm;
66};
67
68struct aesni_gcm_set_hash_subkey_result {
69 int err;
70 struct completion completion;
71};
72
73struct aesni_hash_subkey_req_data {
74 u8 iv[16];
75 struct aesni_gcm_set_hash_subkey_result result;
76 struct scatterlist sg;
77};
78
79#define AESNI_ALIGN (16)
46#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) 80#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
81#define RFC4106_HASH_SUBKEY_SIZE 16
47 82
48asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 83asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
49 unsigned int key_len); 84 unsigned int key_len);
@@ -59,9 +94,66 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
59 const u8 *in, unsigned int len, u8 *iv); 94 const u8 *in, unsigned int len, u8 *iv);
60asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
61 const u8 *in, unsigned int len, u8 *iv); 96 const u8 *in, unsigned int len, u8 *iv);
97
98int crypto_fpu_init(void);
99void crypto_fpu_exit(void);
100
101#ifdef CONFIG_X86_64
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 102asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv); 103 const u8 *in, unsigned int len, u8 *iv);
64 104
105/* asmlinkage void aesni_gcm_enc()
106 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
107 * u8 *out, Ciphertext output. Encrypt in-place is allowed.
108 * const u8 *in, Plaintext input
109 * unsigned long plaintext_len, Length of data in bytes for encryption.
110 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
111 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
112 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
113 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
114 * const u8 *aad, Additional Authentication Data (AAD)
115 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
116 * is going to be 8 or 12 bytes
117 * u8 *auth_tag, Authenticated Tag output.
118 * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
119 * Valid values are 16 (most likely), 12 or 8.
120 */
121asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
122 const u8 *in, unsigned long plaintext_len, u8 *iv,
123 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
124 u8 *auth_tag, unsigned long auth_tag_len);
125
126/* asmlinkage void aesni_gcm_dec()
127 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
128 * u8 *out, Plaintext output. Decrypt in-place is allowed.
129 * const u8 *in, Ciphertext input
130 * unsigned long ciphertext_len, Length of data in bytes for decryption.
131 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
132 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
133 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
134 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
135 * const u8 *aad, Additional Authentication Data (AAD)
136 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
137 * to be 8 or 12 bytes
138 * u8 *auth_tag, Authenticated Tag output.
139 * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
140 * Valid values are 16 (most likely), 12 or 8.
141 */
142asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
143 const u8 *in, unsigned long ciphertext_len, u8 *iv,
144 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
145 u8 *auth_tag, unsigned long auth_tag_len);
146
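Taken together, the glue code later in this patch drives these two routines in essentially the following way (a condensed sketch: scatterlist mapping, kernel_fpu_begin()/end() and error handling are omitted, src, dst and assoc are assumed to be flat, already linearised buffers, and the real code aligns the IV buffer with PTR_ALIGN rather than an attribute):

	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
	unsigned long auth_tag_len = crypto_aead_authsize(tfm);
	u8 iv[16] __attribute__((aligned(16)));

	/* j0 = 4 byte salt || 8 byte per-packet IV || 0x00000001 */
	memcpy(iv, ctx->nonce, 4);
	memcpy(iv + 4, req->iv, 8);
	*(__be32 *)(iv + 12) = cpu_to_be32(1);

	aesni_gcm_enc(&ctx->aes_key_expanded, dst, src, req->cryptlen, iv,
		      ctx->hash_subkey, assoc, req->assoclen,
		      dst + req->cryptlen, auth_tag_len);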
147static inline struct
148aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
149{
150 return
151 (struct aesni_rfc4106_gcm_ctx *)
152 PTR_ALIGN((u8 *)
153 crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
154}
155#endif
156
65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 157static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
66{ 158{
67 unsigned long addr = (unsigned long)raw_ctx; 159 unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +416,7 @@ static struct crypto_alg blk_cbc_alg = {
324 }, 416 },
325}; 417};
326 418
419#ifdef CONFIG_X86_64
327static void ctr_crypt_final(struct crypto_aes_ctx *ctx, 420static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
328 struct blkcipher_walk *walk) 421 struct blkcipher_walk *walk)
329{ 422{
@@ -389,6 +482,7 @@ static struct crypto_alg blk_ctr_alg = {
389 }, 482 },
390 }, 483 },
391}; 484};
485#endif
392 486
393static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, 487static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
394 unsigned int key_len) 488 unsigned int key_len)
@@ -536,6 +630,7 @@ static struct crypto_alg ablk_cbc_alg = {
536 }, 630 },
537}; 631};
538 632
633#ifdef CONFIG_X86_64
539static int ablk_ctr_init(struct crypto_tfm *tfm) 634static int ablk_ctr_init(struct crypto_tfm *tfm)
540{ 635{
541 struct cryptd_ablkcipher *cryptd_tfm; 636 struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +707,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
612 }, 707 },
613}; 708};
614#endif 709#endif
710#endif
615 711
616#ifdef HAS_LRW 712#ifdef HAS_LRW
617static int ablk_lrw_init(struct crypto_tfm *tfm) 713static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +826,432 @@ static struct crypto_alg ablk_xts_alg = {
730}; 826};
731#endif 827#endif
732 828
829#ifdef CONFIG_X86_64
830static int rfc4106_init(struct crypto_tfm *tfm)
831{
832 struct cryptd_aead *cryptd_tfm;
833 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
834 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
835 struct crypto_aead *cryptd_child;
836 struct aesni_rfc4106_gcm_ctx *child_ctx;
837 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
838 if (IS_ERR(cryptd_tfm))
839 return PTR_ERR(cryptd_tfm);
840
841 cryptd_child = cryptd_aead_child(cryptd_tfm);
842 child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child);
843 memcpy(child_ctx, ctx, sizeof(*ctx));
844 ctx->cryptd_tfm = cryptd_tfm;
845 tfm->crt_aead.reqsize = sizeof(struct aead_request)
846 + crypto_aead_reqsize(&cryptd_tfm->base);
847 return 0;
848}
849
850static void rfc4106_exit(struct crypto_tfm *tfm)
851{
852 struct aesni_rfc4106_gcm_ctx *ctx =
853 (struct aesni_rfc4106_gcm_ctx *)
854 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
855 if (!IS_ERR(ctx->cryptd_tfm))
856 cryptd_free_aead(ctx->cryptd_tfm);
857 return;
858}
859
860static void
861rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
862{
863 struct aesni_gcm_set_hash_subkey_result *result = req->data;
864
865 if (err == -EINPROGRESS)
866 return;
867 result->err = err;
868 complete(&result->completion);
869}
870
871static int
872rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
873{
874 struct crypto_ablkcipher *ctr_tfm;
875 struct ablkcipher_request *req;
876 int ret = -EINVAL;
877 struct aesni_hash_subkey_req_data *req_data;
878
879 ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
880 if (IS_ERR(ctr_tfm))
881 return PTR_ERR(ctr_tfm);
882
883 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
884
885 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
886 if (ret)
887 goto out_free_ablkcipher;
888
889 ret = -ENOMEM;
890 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
891 if (!req)
892 goto out_free_ablkcipher;
893
894 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
895 if (!req_data)
896 goto out_free_request;
897
898 memset(req_data->iv, 0, sizeof(req_data->iv));
899
900 /* Clear the data in the hash sub key container to zero.*/
901 /* We want to cipher all zeros to create the hash sub key. */
902 memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
903
904 init_completion(&req_data->result.completion);
905 sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
906 ablkcipher_request_set_tfm(req, ctr_tfm);
907 ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
908 CRYPTO_TFM_REQ_MAY_BACKLOG,
909 rfc4106_set_hash_subkey_done,
910 &req_data->result);
911
912 ablkcipher_request_set_crypt(req, &req_data->sg,
913 &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
914
915 ret = crypto_ablkcipher_encrypt(req);
916 if (ret == -EINPROGRESS || ret == -EBUSY) {
917 ret = wait_for_completion_interruptible
918 (&req_data->result.completion);
919 if (!ret)
920 ret = req_data->result.err;
921 }
922 kfree(req_data);
923out_free_request:
924 ablkcipher_request_free(req);
925out_free_ablkcipher:
926 crypto_free_ablkcipher(ctr_tfm);
927 return ret;
928}
929
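The reason a plain ctr(aes) request over a zero buffer yields the GHASH hash subkey: GCM defines the subkey as H = AES_K(0^128), and CTR mode with an all-zero 16-byte counter block J0 produces, for the first (and only) block,

	H = 0^128 XOR AES_K(J0) = AES_K(0^128)

so the existing ctr(aes) implementation can be reused instead of invoking the block cipher directly.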
930static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
931 unsigned int key_len)
932{
933 int ret = 0;
934 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
935 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
936 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
937 struct aesni_rfc4106_gcm_ctx *child_ctx =
938 aesni_rfc4106_gcm_ctx_get(cryptd_child);
939 u8 *new_key_mem = NULL;
940
941 if (key_len < 4) {
942 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
943 return -EINVAL;
944 }
945 /*Account for 4 byte nonce at the end.*/
946 key_len -= 4;
947 if (key_len != AES_KEYSIZE_128) {
948 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
949 return -EINVAL;
950 }
951
952 memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
953 /*This must be on a 16 byte boundary!*/
954 if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
955 return -EINVAL;
956
957 if ((unsigned long)key % AESNI_ALIGN) {
 958 /* key is not aligned: use an auxiliary aligned pointer */
959 new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
960 if (!new_key_mem)
961 return -ENOMEM;
962
963 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
964 memcpy(new_key_mem, key, key_len);
965 key = new_key_mem;
966 }
967
968 if (!irq_fpu_usable())
969 ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
970 key, key_len);
971 else {
972 kernel_fpu_begin();
973 ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
974 kernel_fpu_end();
975 }
976 /*This must be on a 16 byte boundary!*/
977 if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
978 ret = -EINVAL;
979 goto exit;
980 }
981 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
982 memcpy(child_ctx, ctx, sizeof(*ctx));
983exit:
984 kfree(new_key_mem);
985 return ret;
986}
987
 988/* This is the Integrity Check Value (aka the authentication tag) length, which
 989 * can be 8, 12 or 16 bytes long. */
990static int rfc4106_set_authsize(struct crypto_aead *parent,
991 unsigned int authsize)
992{
993 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
994 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
995
996 switch (authsize) {
997 case 8:
998 case 12:
999 case 16:
1000 break;
1001 default:
1002 return -EINVAL;
1003 }
1004 crypto_aead_crt(parent)->authsize = authsize;
1005 crypto_aead_crt(cryptd_child)->authsize = authsize;
1006 return 0;
1007}
1008
1009static int rfc4106_encrypt(struct aead_request *req)
1010{
1011 int ret;
1012 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1013 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1014
1015 if (!irq_fpu_usable()) {
1016 struct aead_request *cryptd_req =
1017 (struct aead_request *) aead_request_ctx(req);
1018 memcpy(cryptd_req, req, sizeof(*req));
1019 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1020 return crypto_aead_encrypt(cryptd_req);
1021 } else {
1022 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1023 kernel_fpu_begin();
1024 ret = cryptd_child->base.crt_aead.encrypt(req);
1025 kernel_fpu_end();
1026 return ret;
1027 }
1028}
1029
1030static int rfc4106_decrypt(struct aead_request *req)
1031{
1032 int ret;
1033 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1034 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1035
1036 if (!irq_fpu_usable()) {
1037 struct aead_request *cryptd_req =
1038 (struct aead_request *) aead_request_ctx(req);
1039 memcpy(cryptd_req, req, sizeof(*req));
1040 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1041 return crypto_aead_decrypt(cryptd_req);
1042 } else {
1043 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1044 kernel_fpu_begin();
1045 ret = cryptd_child->base.crt_aead.decrypt(req);
1046 kernel_fpu_end();
1047 return ret;
1048 }
1049}
1050
1051static struct crypto_alg rfc4106_alg = {
1052 .cra_name = "rfc4106(gcm(aes))",
1053 .cra_driver_name = "rfc4106-gcm-aesni",
1054 .cra_priority = 400,
1055 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1056 .cra_blocksize = 1,
1057 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1058 .cra_alignmask = 0,
1059 .cra_type = &crypto_nivaead_type,
1060 .cra_module = THIS_MODULE,
1061 .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
1062 .cra_init = rfc4106_init,
1063 .cra_exit = rfc4106_exit,
1064 .cra_u = {
1065 .aead = {
1066 .setkey = rfc4106_set_key,
1067 .setauthsize = rfc4106_set_authsize,
1068 .encrypt = rfc4106_encrypt,
1069 .decrypt = rfc4106_decrypt,
1070 .geniv = "seqiv",
1071 .ivsize = 8,
1072 .maxauthsize = 16,
1073 },
1074 },
1075};
1076
1077static int __driver_rfc4106_encrypt(struct aead_request *req)
1078{
1079 u8 one_entry_in_sg = 0;
1080 u8 *src, *dst, *assoc;
1081 __be32 counter = cpu_to_be32(1);
1082 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1083 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1084 void *aes_ctx = &(ctx->aes_key_expanded);
1085 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1086 u8 iv_tab[16+AESNI_ALIGN];
1087 u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
1088 struct scatter_walk src_sg_walk;
1089 struct scatter_walk assoc_sg_walk;
1090 struct scatter_walk dst_sg_walk;
1091 unsigned int i;
1092
1093 /* Assuming we are supporting rfc4106 64-bit extended */
 1094 /* sequence numbers, we need to have the AAD length equal */
1095 /* to 8 or 12 bytes */
1096 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
1097 return -EINVAL;
1098 /* IV below built */
1099 for (i = 0; i < 4; i++)
1100 *(iv+i) = ctx->nonce[i];
1101 for (i = 0; i < 8; i++)
1102 *(iv+4+i) = req->iv[i];
1103 *((__be32 *)(iv+12)) = counter;
1104
1105 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1106 one_entry_in_sg = 1;
1107 scatterwalk_start(&src_sg_walk, req->src);
1108 scatterwalk_start(&assoc_sg_walk, req->assoc);
1109 src = scatterwalk_map(&src_sg_walk, 0);
1110 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1111 dst = src;
1112 if (unlikely(req->src != req->dst)) {
1113 scatterwalk_start(&dst_sg_walk, req->dst);
1114 dst = scatterwalk_map(&dst_sg_walk, 0);
1115 }
1116
1117 } else {
1118 /* Allocate memory for src, dst, assoc */
1119 src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
1120 GFP_ATOMIC);
1121 if (unlikely(!src))
1122 return -ENOMEM;
1123 assoc = (src + req->cryptlen + auth_tag_len);
1124 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1125 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1126 req->assoclen, 0);
1127 dst = src;
1128 }
1129
1130 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
1131 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
1132 + ((unsigned long)req->cryptlen), auth_tag_len);
1133
1134 /* The authTag (aka the Integrity Check Value) needs to be written
1135 * back to the packet. */
1136 if (one_entry_in_sg) {
1137 if (unlikely(req->src != req->dst)) {
1138 scatterwalk_unmap(dst, 0);
1139 scatterwalk_done(&dst_sg_walk, 0, 0);
1140 }
1141 scatterwalk_unmap(src, 0);
1142 scatterwalk_unmap(assoc, 0);
1143 scatterwalk_done(&src_sg_walk, 0, 0);
1144 scatterwalk_done(&assoc_sg_walk, 0, 0);
1145 } else {
1146 scatterwalk_map_and_copy(dst, req->dst, 0,
1147 req->cryptlen + auth_tag_len, 1);
1148 kfree(src);
1149 }
1150 return 0;
1151}
1152
1153static int __driver_rfc4106_decrypt(struct aead_request *req)
1154{
1155 u8 one_entry_in_sg = 0;
1156 u8 *src, *dst, *assoc;
1157 unsigned long tempCipherLen = 0;
1158 __be32 counter = cpu_to_be32(1);
1159 int retval = 0;
1160 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1161 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1162 void *aes_ctx = &(ctx->aes_key_expanded);
1163 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1164 u8 iv_and_authTag[32+AESNI_ALIGN];
1165 u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
1166 u8 *authTag = iv + 16;
1167 struct scatter_walk src_sg_walk;
1168 struct scatter_walk assoc_sg_walk;
1169 struct scatter_walk dst_sg_walk;
1170 unsigned int i;
1171
1172 if (unlikely((req->cryptlen < auth_tag_len) ||
1173 (req->assoclen != 8 && req->assoclen != 12)))
1174 return -EINVAL;
1175 /* Assuming we are supporting rfc4106 64-bit extended */
 1176 /* sequence numbers, we need to have the AAD length */
1177 /* equal to 8 or 12 bytes */
1178
1179 tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
1180 /* IV below built */
1181 for (i = 0; i < 4; i++)
1182 *(iv+i) = ctx->nonce[i];
1183 for (i = 0; i < 8; i++)
1184 *(iv+4+i) = req->iv[i];
1185 *((__be32 *)(iv+12)) = counter;
1186
1187 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1188 one_entry_in_sg = 1;
1189 scatterwalk_start(&src_sg_walk, req->src);
1190 scatterwalk_start(&assoc_sg_walk, req->assoc);
1191 src = scatterwalk_map(&src_sg_walk, 0);
1192 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1193 dst = src;
1194 if (unlikely(req->src != req->dst)) {
1195 scatterwalk_start(&dst_sg_walk, req->dst);
1196 dst = scatterwalk_map(&dst_sg_walk, 0);
1197 }
1198
1199 } else {
1200 /* Allocate memory for src, dst, assoc */
1201 src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
1202 if (!src)
1203 return -ENOMEM;
1204 assoc = (src + req->cryptlen + auth_tag_len);
1205 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1206 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1207 req->assoclen, 0);
1208 dst = src;
1209 }
1210
1211 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
1212 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1213 authTag, auth_tag_len);
1214
1215 /* Compare generated tag with passed in tag. */
1216 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
1217 -EBADMSG : 0;
1218
1219 if (one_entry_in_sg) {
1220 if (unlikely(req->src != req->dst)) {
1221 scatterwalk_unmap(dst, 0);
1222 scatterwalk_done(&dst_sg_walk, 0, 0);
1223 }
1224 scatterwalk_unmap(src, 0);
1225 scatterwalk_unmap(assoc, 0);
1226 scatterwalk_done(&src_sg_walk, 0, 0);
1227 scatterwalk_done(&assoc_sg_walk, 0, 0);
1228 } else {
1229 scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
1230 kfree(src);
1231 }
1232 return retval;
1233}
1234
1235static struct crypto_alg __rfc4106_alg = {
1236 .cra_name = "__gcm-aes-aesni",
1237 .cra_driver_name = "__driver-gcm-aes-aesni",
1238 .cra_priority = 0,
1239 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
1240 .cra_blocksize = 1,
1241 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1242 .cra_alignmask = 0,
1243 .cra_type = &crypto_aead_type,
1244 .cra_module = THIS_MODULE,
1245 .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
1246 .cra_u = {
1247 .aead = {
1248 .encrypt = __driver_rfc4106_encrypt,
1249 .decrypt = __driver_rfc4106_decrypt,
1250 },
1251 },
1252};
1253#endif
1254
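Once rfc4106_alg is registered, any other kernel code can reach the accelerated transform through the regular AEAD API. A minimal, hypothetical caller for this kernel generation might look like the sketch below (asynchronous completion and error handling are left out, key/iv/assoc/buf contents are the caller's; per rfc4106_set_key() above, the 20-byte key is the 16-byte AES key followed by the 4-byte salt):

	struct crypto_aead *tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
	struct aead_request *req;
	struct scatterlist sg, asg;
	u8 key[20], iv[8], assoc[8];
	u8 buf[64 + 16];	/* plaintext plus room for a 16 byte tag */

	crypto_aead_setkey(tfm, key, sizeof(key));
	crypto_aead_setauthsize(tfm, 16);

	req = aead_request_alloc(tfm, GFP_KERNEL);
	sg_init_one(&sg, buf, sizeof(buf));
	sg_init_one(&asg, assoc, sizeof(assoc));
	aead_request_set_assoc(req, &asg, sizeof(assoc));
	aead_request_set_crypt(req, &sg, &sg, 64, iv);
	crypto_aead_encrypt(req);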
733static int __init aesni_init(void) 1255static int __init aesni_init(void)
734{ 1256{
735 int err; 1257 int err;
@@ -738,6 +1260,9 @@ static int __init aesni_init(void)
738 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); 1260 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
739 return -ENODEV; 1261 return -ENODEV;
740 } 1262 }
1263
1264 if ((err = crypto_fpu_init()))
1265 goto fpu_err;
741 if ((err = crypto_register_alg(&aesni_alg))) 1266 if ((err = crypto_register_alg(&aesni_alg)))
742 goto aes_err; 1267 goto aes_err;
743 if ((err = crypto_register_alg(&__aesni_alg))) 1268 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1271,24 @@ static int __init aesni_init(void)
746 goto blk_ecb_err; 1271 goto blk_ecb_err;
747 if ((err = crypto_register_alg(&blk_cbc_alg))) 1272 if ((err = crypto_register_alg(&blk_cbc_alg)))
748 goto blk_cbc_err; 1273 goto blk_cbc_err;
749 if ((err = crypto_register_alg(&blk_ctr_alg)))
750 goto blk_ctr_err;
751 if ((err = crypto_register_alg(&ablk_ecb_alg))) 1274 if ((err = crypto_register_alg(&ablk_ecb_alg)))
752 goto ablk_ecb_err; 1275 goto ablk_ecb_err;
753 if ((err = crypto_register_alg(&ablk_cbc_alg))) 1276 if ((err = crypto_register_alg(&ablk_cbc_alg)))
754 goto ablk_cbc_err; 1277 goto ablk_cbc_err;
1278#ifdef CONFIG_X86_64
1279 if ((err = crypto_register_alg(&blk_ctr_alg)))
1280 goto blk_ctr_err;
755 if ((err = crypto_register_alg(&ablk_ctr_alg))) 1281 if ((err = crypto_register_alg(&ablk_ctr_alg)))
756 goto ablk_ctr_err; 1282 goto ablk_ctr_err;
1283 if ((err = crypto_register_alg(&__rfc4106_alg)))
1284 goto __aead_gcm_err;
1285 if ((err = crypto_register_alg(&rfc4106_alg)))
1286 goto aead_gcm_err;
757#ifdef HAS_CTR 1287#ifdef HAS_CTR
758 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) 1288 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
759 goto ablk_rfc3686_ctr_err; 1289 goto ablk_rfc3686_ctr_err;
760#endif 1290#endif
1291#endif
761#ifdef HAS_LRW 1292#ifdef HAS_LRW
762 if ((err = crypto_register_alg(&ablk_lrw_alg))) 1293 if ((err = crypto_register_alg(&ablk_lrw_alg)))
763 goto ablk_lrw_err; 1294 goto ablk_lrw_err;
@@ -770,7 +1301,6 @@ static int __init aesni_init(void)
770 if ((err = crypto_register_alg(&ablk_xts_alg))) 1301 if ((err = crypto_register_alg(&ablk_xts_alg)))
771 goto ablk_xts_err; 1302 goto ablk_xts_err;
772#endif 1303#endif
773
774 return err; 1304 return err;
775 1305
776#ifdef HAS_XTS 1306#ifdef HAS_XTS
@@ -784,18 +1314,24 @@ ablk_pcbc_err:
784 crypto_unregister_alg(&ablk_lrw_alg); 1314 crypto_unregister_alg(&ablk_lrw_alg);
785ablk_lrw_err: 1315ablk_lrw_err:
786#endif 1316#endif
1317#ifdef CONFIG_X86_64
787#ifdef HAS_CTR 1318#ifdef HAS_CTR
788 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1319 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
789ablk_rfc3686_ctr_err: 1320ablk_rfc3686_ctr_err:
790#endif 1321#endif
1322 crypto_unregister_alg(&rfc4106_alg);
1323aead_gcm_err:
1324 crypto_unregister_alg(&__rfc4106_alg);
1325__aead_gcm_err:
791 crypto_unregister_alg(&ablk_ctr_alg); 1326 crypto_unregister_alg(&ablk_ctr_alg);
792ablk_ctr_err: 1327ablk_ctr_err:
1328 crypto_unregister_alg(&blk_ctr_alg);
1329blk_ctr_err:
1330#endif
793 crypto_unregister_alg(&ablk_cbc_alg); 1331 crypto_unregister_alg(&ablk_cbc_alg);
794ablk_cbc_err: 1332ablk_cbc_err:
795 crypto_unregister_alg(&ablk_ecb_alg); 1333 crypto_unregister_alg(&ablk_ecb_alg);
796ablk_ecb_err: 1334ablk_ecb_err:
797 crypto_unregister_alg(&blk_ctr_alg);
798blk_ctr_err:
799 crypto_unregister_alg(&blk_cbc_alg); 1335 crypto_unregister_alg(&blk_cbc_alg);
800blk_cbc_err: 1336blk_cbc_err:
801 crypto_unregister_alg(&blk_ecb_alg); 1337 crypto_unregister_alg(&blk_ecb_alg);
@@ -804,6 +1340,7 @@ blk_ecb_err:
804__aes_err: 1340__aes_err:
805 crypto_unregister_alg(&aesni_alg); 1341 crypto_unregister_alg(&aesni_alg);
806aes_err: 1342aes_err:
1343fpu_err:
807 return err; 1344 return err;
808} 1345}
809 1346
@@ -818,17 +1355,23 @@ static void __exit aesni_exit(void)
818#ifdef HAS_LRW 1355#ifdef HAS_LRW
819 crypto_unregister_alg(&ablk_lrw_alg); 1356 crypto_unregister_alg(&ablk_lrw_alg);
820#endif 1357#endif
1358#ifdef CONFIG_X86_64
821#ifdef HAS_CTR 1359#ifdef HAS_CTR
822 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1360 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
823#endif 1361#endif
1362 crypto_unregister_alg(&rfc4106_alg);
1363 crypto_unregister_alg(&__rfc4106_alg);
824 crypto_unregister_alg(&ablk_ctr_alg); 1364 crypto_unregister_alg(&ablk_ctr_alg);
1365 crypto_unregister_alg(&blk_ctr_alg);
1366#endif
825 crypto_unregister_alg(&ablk_cbc_alg); 1367 crypto_unregister_alg(&ablk_cbc_alg);
826 crypto_unregister_alg(&ablk_ecb_alg); 1368 crypto_unregister_alg(&ablk_ecb_alg);
827 crypto_unregister_alg(&blk_ctr_alg);
828 crypto_unregister_alg(&blk_cbc_alg); 1369 crypto_unregister_alg(&blk_cbc_alg);
829 crypto_unregister_alg(&blk_ecb_alg); 1370 crypto_unregister_alg(&blk_ecb_alg);
830 crypto_unregister_alg(&__aesni_alg); 1371 crypto_unregister_alg(&__aesni_alg);
831 crypto_unregister_alg(&aesni_alg); 1372 crypto_unregister_alg(&aesni_alg);
1373
1374 crypto_fpu_exit();
832} 1375}
833 1376
834module_init(aesni_init); 1377module_init(aesni_init);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 1a8f8649c035..98d7a188f46b 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = {
150 .module = THIS_MODULE, 150 .module = THIS_MODULE,
151}; 151};
152 152
153static int __init crypto_fpu_module_init(void) 153int __init crypto_fpu_init(void)
154{ 154{
155 return crypto_register_template(&crypto_fpu_tmpl); 155 return crypto_register_template(&crypto_fpu_tmpl);
156} 156}
157 157
158static void __exit crypto_fpu_module_exit(void) 158void __exit crypto_fpu_exit(void)
159{ 159{
160 crypto_unregister_template(&crypto_fpu_tmpl); 160 crypto_unregister_template(&crypto_fpu_tmpl);
161} 161}
162
163module_init(crypto_fpu_module_init);
164module_exit(crypto_fpu_module_exit);
165
166MODULE_LICENSE("GPL");
167MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index cbcc8d8ea93a..7a6e68e4f748 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -10,6 +10,7 @@
10 * by the Free Software Foundation. 10 * by the Free Software Foundation.
11 */ 11 */
12 12
13#include <linux/err.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/kernel.h> 16#include <linux/kernel.h>