diff options
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/Makefile | 3 | ||||
-rw-r--r-- | arch/x86/crypto/camellia-aesni-avx-asm_64.S | 1102 | ||||
-rw-r--r-- | arch/x86/crypto/camellia_aesni_avx_glue.c | 558 |
3 files changed, 1663 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 84d7dbaba26e..e0ca7c9ac383 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile | |||
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o | |||
12 | 12 | ||
13 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o | 13 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o |
14 | obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o | 14 | obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o |
15 | obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o | ||
15 | obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o | 16 | obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o |
16 | obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o | 17 | obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o |
17 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o | 18 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o |
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o | |||
34 | 35 | ||
35 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o | 36 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o |
36 | camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o | 37 | camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o |
38 | camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \ | ||
39 | camellia_aesni_avx_glue.o | ||
37 | cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o | 40 | cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o |
38 | cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o | 41 | cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o |
39 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o | 42 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o |
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S new file mode 100644 index 000000000000..2306d2e4816f --- /dev/null +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S | |||
@@ -0,0 +1,1102 @@ | |||
1 | /* | ||
2 | * x86_64/AVX/AES-NI assembler implementation of Camellia | ||
3 | * | ||
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | /* | ||
14 | * Version licensed under 2-clause BSD License is available at: | ||
15 | * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz | ||
16 | */ | ||
17 | |||
18 | #define CAMELLIA_TABLE_BYTE_LEN 272 | ||
19 | |||
20 | /* struct camellia_ctx: */ | ||
21 | #define key_table 0 | ||
22 | #define key_length CAMELLIA_TABLE_BYTE_LEN | ||
23 | |||
24 | /* register macros */ | ||
25 | #define CTX %rdi | ||
26 | |||
27 | /********************************************************************** | ||
28 | 16-way camellia | ||
29 | **********************************************************************/ | ||
30 | #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ | ||
31 | vpand x, mask4bit, tmp0; \ | ||
32 | vpandn x, mask4bit, x; \ | ||
33 | vpsrld $4, x, x; \ | ||
34 | \ | ||
35 | vpshufb tmp0, lo_t, tmp0; \ | ||
36 | vpshufb x, hi_t, x; \ | ||
37 | vpxor tmp0, x, x; | ||
38 | |||
39 | /* | ||
40 | * IN: | ||
41 | * x0..x7: byte-sliced AB state | ||
42 | * mem_cd: register pointer storing CD state | ||
43 | * key: index for key material | ||
44 | * OUT: | ||
45 | * x0..x7: new byte-sliced CD state | ||
46 | */ | ||
47 | #define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ | ||
48 | t7, mem_cd, key) \ | ||
49 | /* \ | ||
50 | * S-function with AES subbytes \ | ||
51 | */ \ | ||
52 | vmovdqa .Linv_shift_row, t4; \ | ||
53 | vbroadcastss .L0f0f0f0f, t7; \ | ||
54 | vmovdqa .Lpre_tf_lo_s1, t0; \ | ||
55 | vmovdqa .Lpre_tf_hi_s1, t1; \ | ||
56 | \ | ||
57 | /* AES inverse shift rows */ \ | ||
58 | vpshufb t4, x0, x0; \ | ||
59 | vpshufb t4, x7, x7; \ | ||
60 | vpshufb t4, x1, x1; \ | ||
61 | vpshufb t4, x4, x4; \ | ||
62 | vpshufb t4, x2, x2; \ | ||
63 | vpshufb t4, x5, x5; \ | ||
64 | vpshufb t4, x3, x3; \ | ||
65 | vpshufb t4, x6, x6; \ | ||
66 | \ | ||
67 | /* prefilter sboxes 1, 2 and 3 */ \ | ||
68 | vmovdqa .Lpre_tf_lo_s4, t2; \ | ||
69 | vmovdqa .Lpre_tf_hi_s4, t3; \ | ||
70 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
71 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
72 | filter_8bit(x1, t0, t1, t7, t6); \ | ||
73 | filter_8bit(x4, t0, t1, t7, t6); \ | ||
74 | filter_8bit(x2, t0, t1, t7, t6); \ | ||
75 | filter_8bit(x5, t0, t1, t7, t6); \ | ||
76 | \ | ||
77 | /* prefilter sbox 4 */ \ | ||
78 | vpxor t4, t4, t4; \ | ||
79 | filter_8bit(x3, t2, t3, t7, t6); \ | ||
80 | filter_8bit(x6, t2, t3, t7, t6); \ | ||
81 | \ | ||
82 | /* AES subbytes + AES shift rows */ \ | ||
83 | vmovdqa .Lpost_tf_lo_s1, t0; \ | ||
84 | vmovdqa .Lpost_tf_hi_s1, t1; \ | ||
85 | vaesenclast t4, x0, x0; \ | ||
86 | vaesenclast t4, x7, x7; \ | ||
87 | vaesenclast t4, x1, x1; \ | ||
88 | vaesenclast t4, x4, x4; \ | ||
89 | vaesenclast t4, x2, x2; \ | ||
90 | vaesenclast t4, x5, x5; \ | ||
91 | vaesenclast t4, x3, x3; \ | ||
92 | vaesenclast t4, x6, x6; \ | ||
93 | \ | ||
94 | /* postfilter sboxes 1 and 4 */ \ | ||
95 | vmovdqa .Lpost_tf_lo_s3, t2; \ | ||
96 | vmovdqa .Lpost_tf_hi_s3, t3; \ | ||
97 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
98 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
99 | filter_8bit(x3, t0, t1, t7, t6); \ | ||
100 | filter_8bit(x6, t0, t1, t7, t6); \ | ||
101 | \ | ||
102 | /* postfilter sbox 3 */ \ | ||
103 | vmovdqa .Lpost_tf_lo_s2, t4; \ | ||
104 | vmovdqa .Lpost_tf_hi_s2, t5; \ | ||
105 | filter_8bit(x2, t2, t3, t7, t6); \ | ||
106 | filter_8bit(x5, t2, t3, t7, t6); \ | ||
107 | \ | ||
108 | vpxor t6, t6, t6; \ | ||
109 | vmovq key, t0; \ | ||
110 | \ | ||
111 | /* postfilter sbox 2 */ \ | ||
112 | filter_8bit(x1, t4, t5, t7, t2); \ | ||
113 | filter_8bit(x4, t4, t5, t7, t2); \ | ||
114 | \ | ||
115 | vpsrldq $5, t0, t5; \ | ||
116 | vpsrldq $1, t0, t1; \ | ||
117 | vpsrldq $2, t0, t2; \ | ||
118 | vpsrldq $3, t0, t3; \ | ||
119 | vpsrldq $4, t0, t4; \ | ||
120 | vpshufb t6, t0, t0; \ | ||
121 | vpshufb t6, t1, t1; \ | ||
122 | vpshufb t6, t2, t2; \ | ||
123 | vpshufb t6, t3, t3; \ | ||
124 | vpshufb t6, t4, t4; \ | ||
125 | vpsrldq $2, t5, t7; \ | ||
126 | vpshufb t6, t7, t7; \ | ||
127 | \ | ||
128 | /* \ | ||
129 | * P-function \ | ||
130 | */ \ | ||
131 | vpxor x5, x0, x0; \ | ||
132 | vpxor x6, x1, x1; \ | ||
133 | vpxor x7, x2, x2; \ | ||
134 | vpxor x4, x3, x3; \ | ||
135 | \ | ||
136 | vpxor x2, x4, x4; \ | ||
137 | vpxor x3, x5, x5; \ | ||
138 | vpxor x0, x6, x6; \ | ||
139 | vpxor x1, x7, x7; \ | ||
140 | \ | ||
141 | vpxor x7, x0, x0; \ | ||
142 | vpxor x4, x1, x1; \ | ||
143 | vpxor x5, x2, x2; \ | ||
144 | vpxor x6, x3, x3; \ | ||
145 | \ | ||
146 | vpxor x3, x4, x4; \ | ||
147 | vpxor x0, x5, x5; \ | ||
148 | vpxor x1, x6, x6; \ | ||
149 | vpxor x2, x7, x7; /* note: high and low parts swapped */ \ | ||
150 | \ | ||
151 | /* \ | ||
152 | * Add key material and result to CD (x becomes new CD) \ | ||
153 | */ \ | ||
154 | \ | ||
155 | vpxor t3, x4, x4; \ | ||
156 | vpxor 0 * 16(mem_cd), x4, x4; \ | ||
157 | \ | ||
158 | vpxor t2, x5, x5; \ | ||
159 | vpxor 1 * 16(mem_cd), x5, x5; \ | ||
160 | \ | ||
161 | vpsrldq $1, t5, t3; \ | ||
162 | vpshufb t6, t5, t5; \ | ||
163 | vpshufb t6, t3, t6; \ | ||
164 | \ | ||
165 | vpxor t1, x6, x6; \ | ||
166 | vpxor 2 * 16(mem_cd), x6, x6; \ | ||
167 | \ | ||
168 | vpxor t0, x7, x7; \ | ||
169 | vpxor 3 * 16(mem_cd), x7, x7; \ | ||
170 | \ | ||
171 | vpxor t7, x0, x0; \ | ||
172 | vpxor 4 * 16(mem_cd), x0, x0; \ | ||
173 | \ | ||
174 | vpxor t6, x1, x1; \ | ||
175 | vpxor 5 * 16(mem_cd), x1, x1; \ | ||
176 | \ | ||
177 | vpxor t5, x2, x2; \ | ||
178 | vpxor 6 * 16(mem_cd), x2, x2; \ | ||
179 | \ | ||
180 | vpxor t4, x3, x3; \ | ||
181 | vpxor 7 * 16(mem_cd), x3, x3; | ||
182 | |||
183 | /* | ||
184 | * Size optimization... with inlined roundsm16, binary would be over 5 times | ||
185 | * larger and would only be 0.5% faster (on sandy-bridge). | ||
186 | */ | ||
187 | .align 8 | ||
188 | roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: | ||
189 | roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
190 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, | ||
191 | %rcx, (%r9)); | ||
192 | ret; | ||
193 | |||
194 | .align 8 | ||
195 | roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: | ||
196 | roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3, | ||
197 | %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11, | ||
198 | %rax, (%r9)); | ||
199 | ret; | ||
200 | |||
201 | /* | ||
202 | * IN/OUT: | ||
203 | * x0..x7: byte-sliced AB state preloaded | ||
204 | * mem_ab: byte-sliced AB state in memory | ||
205 | * mem_cb: byte-sliced CD state in memory | ||
206 | */ | ||
207 | #define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
208 | y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ | ||
209 | leaq (key_table + (i) * 8)(CTX), %r9; \ | ||
210 | call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ | ||
211 | \ | ||
212 | vmovdqu x4, 0 * 16(mem_cd); \ | ||
213 | vmovdqu x5, 1 * 16(mem_cd); \ | ||
214 | vmovdqu x6, 2 * 16(mem_cd); \ | ||
215 | vmovdqu x7, 3 * 16(mem_cd); \ | ||
216 | vmovdqu x0, 4 * 16(mem_cd); \ | ||
217 | vmovdqu x1, 5 * 16(mem_cd); \ | ||
218 | vmovdqu x2, 6 * 16(mem_cd); \ | ||
219 | vmovdqu x3, 7 * 16(mem_cd); \ | ||
220 | \ | ||
221 | leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ | ||
222 | call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ | ||
223 | \ | ||
224 | store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); | ||
225 | |||
226 | #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ | ||
227 | |||
228 | #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ | ||
229 | /* Store new AB state */ \ | ||
230 | vmovdqu x0, 0 * 16(mem_ab); \ | ||
231 | vmovdqu x1, 1 * 16(mem_ab); \ | ||
232 | vmovdqu x2, 2 * 16(mem_ab); \ | ||
233 | vmovdqu x3, 3 * 16(mem_ab); \ | ||
234 | vmovdqu x4, 4 * 16(mem_ab); \ | ||
235 | vmovdqu x5, 5 * 16(mem_ab); \ | ||
236 | vmovdqu x6, 6 * 16(mem_ab); \ | ||
237 | vmovdqu x7, 7 * 16(mem_ab); | ||
238 | |||
239 | #define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
240 | y6, y7, mem_ab, mem_cd, i) \ | ||
241 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
242 | y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ | ||
243 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
244 | y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ | ||
245 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
246 | y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); | ||
247 | |||
248 | #define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
249 | y6, y7, mem_ab, mem_cd, i) \ | ||
250 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
251 | y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ | ||
252 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
253 | y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ | ||
254 | two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
255 | y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); | ||
256 | |||
257 | /* | ||
258 | * IN: | ||
259 | * v0..3: byte-sliced 32-bit integers | ||
260 | * OUT: | ||
261 | * v0..3: (IN <<< 1) | ||
262 | */ | ||
263 | #define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \ | ||
264 | vpcmpgtb v0, zero, t0; \ | ||
265 | vpaddb v0, v0, v0; \ | ||
266 | vpabsb t0, t0; \ | ||
267 | \ | ||
268 | vpcmpgtb v1, zero, t1; \ | ||
269 | vpaddb v1, v1, v1; \ | ||
270 | vpabsb t1, t1; \ | ||
271 | \ | ||
272 | vpcmpgtb v2, zero, t2; \ | ||
273 | vpaddb v2, v2, v2; \ | ||
274 | vpabsb t2, t2; \ | ||
275 | \ | ||
276 | vpor t0, v1, v1; \ | ||
277 | \ | ||
278 | vpcmpgtb v3, zero, t0; \ | ||
279 | vpaddb v3, v3, v3; \ | ||
280 | vpabsb t0, t0; \ | ||
281 | \ | ||
282 | vpor t1, v2, v2; \ | ||
283 | vpor t2, v3, v3; \ | ||
284 | vpor t0, v0, v0; | ||
285 | |||
286 | /* | ||
287 | * IN: | ||
288 | * r: byte-sliced AB state in memory | ||
289 | * l: byte-sliced CD state in memory | ||
290 | * OUT: | ||
291 | * x0..x7: new byte-sliced CD state | ||
292 | */ | ||
293 | #define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ | ||
294 | tt1, tt2, tt3, kll, klr, krl, krr) \ | ||
295 | /* \ | ||
296 | * t0 = kll; \ | ||
297 | * t0 &= ll; \ | ||
298 | * lr ^= rol32(t0, 1); \ | ||
299 | */ \ | ||
300 | vpxor tt0, tt0, tt0; \ | ||
301 | vmovd kll, t0; \ | ||
302 | vpshufb tt0, t0, t3; \ | ||
303 | vpsrldq $1, t0, t0; \ | ||
304 | vpshufb tt0, t0, t2; \ | ||
305 | vpsrldq $1, t0, t0; \ | ||
306 | vpshufb tt0, t0, t1; \ | ||
307 | vpsrldq $1, t0, t0; \ | ||
308 | vpshufb tt0, t0, t0; \ | ||
309 | \ | ||
310 | vpand l0, t0, t0; \ | ||
311 | vpand l1, t1, t1; \ | ||
312 | vpand l2, t2, t2; \ | ||
313 | vpand l3, t3, t3; \ | ||
314 | \ | ||
315 | rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
316 | \ | ||
317 | vpxor l4, t0, l4; \ | ||
318 | vmovdqu l4, 4 * 16(l); \ | ||
319 | vpxor l5, t1, l5; \ | ||
320 | vmovdqu l5, 5 * 16(l); \ | ||
321 | vpxor l6, t2, l6; \ | ||
322 | vmovdqu l6, 6 * 16(l); \ | ||
323 | vpxor l7, t3, l7; \ | ||
324 | vmovdqu l7, 7 * 16(l); \ | ||
325 | \ | ||
326 | /* \ | ||
327 | * t2 = krr; \ | ||
328 | * t2 |= rr; \ | ||
329 | * rl ^= t2; \ | ||
330 | */ \ | ||
331 | \ | ||
332 | vmovd krr, t0; \ | ||
333 | vpshufb tt0, t0, t3; \ | ||
334 | vpsrldq $1, t0, t0; \ | ||
335 | vpshufb tt0, t0, t2; \ | ||
336 | vpsrldq $1, t0, t0; \ | ||
337 | vpshufb tt0, t0, t1; \ | ||
338 | vpsrldq $1, t0, t0; \ | ||
339 | vpshufb tt0, t0, t0; \ | ||
340 | \ | ||
341 | vpor 4 * 16(r), t0, t0; \ | ||
342 | vpor 5 * 16(r), t1, t1; \ | ||
343 | vpor 6 * 16(r), t2, t2; \ | ||
344 | vpor 7 * 16(r), t3, t3; \ | ||
345 | \ | ||
346 | vpxor 0 * 16(r), t0, t0; \ | ||
347 | vpxor 1 * 16(r), t1, t1; \ | ||
348 | vpxor 2 * 16(r), t2, t2; \ | ||
349 | vpxor 3 * 16(r), t3, t3; \ | ||
350 | vmovdqu t0, 0 * 16(r); \ | ||
351 | vmovdqu t1, 1 * 16(r); \ | ||
352 | vmovdqu t2, 2 * 16(r); \ | ||
353 | vmovdqu t3, 3 * 16(r); \ | ||
354 | \ | ||
355 | /* \ | ||
356 | * t2 = krl; \ | ||
357 | * t2 &= rl; \ | ||
358 | * rr ^= rol32(t2, 1); \ | ||
359 | */ \ | ||
360 | vmovd krl, t0; \ | ||
361 | vpshufb tt0, t0, t3; \ | ||
362 | vpsrldq $1, t0, t0; \ | ||
363 | vpshufb tt0, t0, t2; \ | ||
364 | vpsrldq $1, t0, t0; \ | ||
365 | vpshufb tt0, t0, t1; \ | ||
366 | vpsrldq $1, t0, t0; \ | ||
367 | vpshufb tt0, t0, t0; \ | ||
368 | \ | ||
369 | vpand 0 * 16(r), t0, t0; \ | ||
370 | vpand 1 * 16(r), t1, t1; \ | ||
371 | vpand 2 * 16(r), t2, t2; \ | ||
372 | vpand 3 * 16(r), t3, t3; \ | ||
373 | \ | ||
374 | rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
375 | \ | ||
376 | vpxor 4 * 16(r), t0, t0; \ | ||
377 | vpxor 5 * 16(r), t1, t1; \ | ||
378 | vpxor 6 * 16(r), t2, t2; \ | ||
379 | vpxor 7 * 16(r), t3, t3; \ | ||
380 | vmovdqu t0, 4 * 16(r); \ | ||
381 | vmovdqu t1, 5 * 16(r); \ | ||
382 | vmovdqu t2, 6 * 16(r); \ | ||
383 | vmovdqu t3, 7 * 16(r); \ | ||
384 | \ | ||
385 | /* \ | ||
386 | * t0 = klr; \ | ||
387 | * t0 |= lr; \ | ||
388 | * ll ^= t0; \ | ||
389 | */ \ | ||
390 | \ | ||
391 | vmovd klr, t0; \ | ||
392 | vpshufb tt0, t0, t3; \ | ||
393 | vpsrldq $1, t0, t0; \ | ||
394 | vpshufb tt0, t0, t2; \ | ||
395 | vpsrldq $1, t0, t0; \ | ||
396 | vpshufb tt0, t0, t1; \ | ||
397 | vpsrldq $1, t0, t0; \ | ||
398 | vpshufb tt0, t0, t0; \ | ||
399 | \ | ||
400 | vpor l4, t0, t0; \ | ||
401 | vpor l5, t1, t1; \ | ||
402 | vpor l6, t2, t2; \ | ||
403 | vpor l7, t3, t3; \ | ||
404 | \ | ||
405 | vpxor l0, t0, l0; \ | ||
406 | vmovdqu l0, 0 * 16(l); \ | ||
407 | vpxor l1, t1, l1; \ | ||
408 | vmovdqu l1, 1 * 16(l); \ | ||
409 | vpxor l2, t2, l2; \ | ||
410 | vmovdqu l2, 2 * 16(l); \ | ||
411 | vpxor l3, t3, l3; \ | ||
412 | vmovdqu l3, 3 * 16(l); | ||
413 | |||
414 | #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ | ||
415 | vpunpckhdq x1, x0, t2; \ | ||
416 | vpunpckldq x1, x0, x0; \ | ||
417 | \ | ||
418 | vpunpckldq x3, x2, t1; \ | ||
419 | vpunpckhdq x3, x2, x2; \ | ||
420 | \ | ||
421 | vpunpckhqdq t1, x0, x1; \ | ||
422 | vpunpcklqdq t1, x0, x0; \ | ||
423 | \ | ||
424 | vpunpckhqdq x2, t2, x3; \ | ||
425 | vpunpcklqdq x2, t2, x2; | ||
426 | |||
427 | #define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \ | ||
428 | b3, c3, d3, st0, st1) \ | ||
429 | vmovdqu d2, st0; \ | ||
430 | vmovdqu d3, st1; \ | ||
431 | transpose_4x4(a0, a1, a2, a3, d2, d3); \ | ||
432 | transpose_4x4(b0, b1, b2, b3, d2, d3); \ | ||
433 | vmovdqu st0, d2; \ | ||
434 | vmovdqu st1, d3; \ | ||
435 | \ | ||
436 | vmovdqu a0, st0; \ | ||
437 | vmovdqu a1, st1; \ | ||
438 | transpose_4x4(c0, c1, c2, c3, a0, a1); \ | ||
439 | transpose_4x4(d0, d1, d2, d3, a0, a1); \ | ||
440 | \ | ||
441 | vmovdqu .Lshufb_16x16b, a0; \ | ||
442 | vmovdqu st1, a1; \ | ||
443 | vpshufb a0, a2, a2; \ | ||
444 | vpshufb a0, a3, a3; \ | ||
445 | vpshufb a0, b0, b0; \ | ||
446 | vpshufb a0, b1, b1; \ | ||
447 | vpshufb a0, b2, b2; \ | ||
448 | vpshufb a0, b3, b3; \ | ||
449 | vpshufb a0, a1, a1; \ | ||
450 | vpshufb a0, c0, c0; \ | ||
451 | vpshufb a0, c1, c1; \ | ||
452 | vpshufb a0, c2, c2; \ | ||
453 | vpshufb a0, c3, c3; \ | ||
454 | vpshufb a0, d0, d0; \ | ||
455 | vpshufb a0, d1, d1; \ | ||
456 | vpshufb a0, d2, d2; \ | ||
457 | vpshufb a0, d3, d3; \ | ||
458 | vmovdqu d3, st1; \ | ||
459 | vmovdqu st0, d3; \ | ||
460 | vpshufb a0, d3, a0; \ | ||
461 | vmovdqu d2, st0; \ | ||
462 | \ | ||
463 | transpose_4x4(a0, b0, c0, d0, d2, d3); \ | ||
464 | transpose_4x4(a1, b1, c1, d1, d2, d3); \ | ||
465 | vmovdqu st0, d2; \ | ||
466 | vmovdqu st1, d3; \ | ||
467 | \ | ||
468 | vmovdqu b0, st0; \ | ||
469 | vmovdqu b1, st1; \ | ||
470 | transpose_4x4(a2, b2, c2, d2, b0, b1); \ | ||
471 | transpose_4x4(a3, b3, c3, d3, b0, b1); \ | ||
472 | vmovdqu st0, b0; \ | ||
473 | vmovdqu st1, b1; \ | ||
474 | /* does not adjust output bytes inside vectors */ | ||
475 | |||
476 | /* load blocks to registers and apply pre-whitening */ | ||
477 | #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
478 | y6, y7, rio, key) \ | ||
479 | vmovq key, x0; \ | ||
480 | vpshufb .Lpack_bswap, x0, x0; \ | ||
481 | \ | ||
482 | vpxor 0 * 16(rio), x0, y7; \ | ||
483 | vpxor 1 * 16(rio), x0, y6; \ | ||
484 | vpxor 2 * 16(rio), x0, y5; \ | ||
485 | vpxor 3 * 16(rio), x0, y4; \ | ||
486 | vpxor 4 * 16(rio), x0, y3; \ | ||
487 | vpxor 5 * 16(rio), x0, y2; \ | ||
488 | vpxor 6 * 16(rio), x0, y1; \ | ||
489 | vpxor 7 * 16(rio), x0, y0; \ | ||
490 | vpxor 8 * 16(rio), x0, x7; \ | ||
491 | vpxor 9 * 16(rio), x0, x6; \ | ||
492 | vpxor 10 * 16(rio), x0, x5; \ | ||
493 | vpxor 11 * 16(rio), x0, x4; \ | ||
494 | vpxor 12 * 16(rio), x0, x3; \ | ||
495 | vpxor 13 * 16(rio), x0, x2; \ | ||
496 | vpxor 14 * 16(rio), x0, x1; \ | ||
497 | vpxor 15 * 16(rio), x0, x0; | ||
498 | |||
499 | /* byteslice pre-whitened blocks and store to temporary memory */ | ||
500 | #define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
501 | y6, y7, mem_ab, mem_cd) \ | ||
502 | byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ | ||
503 | y5, y6, y7, (mem_ab), (mem_cd)); \ | ||
504 | \ | ||
505 | vmovdqu x0, 0 * 16(mem_ab); \ | ||
506 | vmovdqu x1, 1 * 16(mem_ab); \ | ||
507 | vmovdqu x2, 2 * 16(mem_ab); \ | ||
508 | vmovdqu x3, 3 * 16(mem_ab); \ | ||
509 | vmovdqu x4, 4 * 16(mem_ab); \ | ||
510 | vmovdqu x5, 5 * 16(mem_ab); \ | ||
511 | vmovdqu x6, 6 * 16(mem_ab); \ | ||
512 | vmovdqu x7, 7 * 16(mem_ab); \ | ||
513 | vmovdqu y0, 0 * 16(mem_cd); \ | ||
514 | vmovdqu y1, 1 * 16(mem_cd); \ | ||
515 | vmovdqu y2, 2 * 16(mem_cd); \ | ||
516 | vmovdqu y3, 3 * 16(mem_cd); \ | ||
517 | vmovdqu y4, 4 * 16(mem_cd); \ | ||
518 | vmovdqu y5, 5 * 16(mem_cd); \ | ||
519 | vmovdqu y6, 6 * 16(mem_cd); \ | ||
520 | vmovdqu y7, 7 * 16(mem_cd); | ||
521 | |||
522 | /* de-byteslice, apply post-whitening and store blocks */ | ||
523 | #define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ | ||
524 | y5, y6, y7, key, stack_tmp0, stack_tmp1) \ | ||
525 | byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \ | ||
526 | y7, x3, x7, stack_tmp0, stack_tmp1); \ | ||
527 | \ | ||
528 | vmovdqu x0, stack_tmp0; \ | ||
529 | \ | ||
530 | vmovq key, x0; \ | ||
531 | vpshufb .Lpack_bswap, x0, x0; \ | ||
532 | \ | ||
533 | vpxor x0, y7, y7; \ | ||
534 | vpxor x0, y6, y6; \ | ||
535 | vpxor x0, y5, y5; \ | ||
536 | vpxor x0, y4, y4; \ | ||
537 | vpxor x0, y3, y3; \ | ||
538 | vpxor x0, y2, y2; \ | ||
539 | vpxor x0, y1, y1; \ | ||
540 | vpxor x0, y0, y0; \ | ||
541 | vpxor x0, x7, x7; \ | ||
542 | vpxor x0, x6, x6; \ | ||
543 | vpxor x0, x5, x5; \ | ||
544 | vpxor x0, x4, x4; \ | ||
545 | vpxor x0, x3, x3; \ | ||
546 | vpxor x0, x2, x2; \ | ||
547 | vpxor x0, x1, x1; \ | ||
548 | vpxor stack_tmp0, x0, x0; | ||
549 | |||
550 | #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
551 | y6, y7, rio) \ | ||
552 | vmovdqu x0, 0 * 16(rio); \ | ||
553 | vmovdqu x1, 1 * 16(rio); \ | ||
554 | vmovdqu x2, 2 * 16(rio); \ | ||
555 | vmovdqu x3, 3 * 16(rio); \ | ||
556 | vmovdqu x4, 4 * 16(rio); \ | ||
557 | vmovdqu x5, 5 * 16(rio); \ | ||
558 | vmovdqu x6, 6 * 16(rio); \ | ||
559 | vmovdqu x7, 7 * 16(rio); \ | ||
560 | vmovdqu y0, 8 * 16(rio); \ | ||
561 | vmovdqu y1, 9 * 16(rio); \ | ||
562 | vmovdqu y2, 10 * 16(rio); \ | ||
563 | vmovdqu y3, 11 * 16(rio); \ | ||
564 | vmovdqu y4, 12 * 16(rio); \ | ||
565 | vmovdqu y5, 13 * 16(rio); \ | ||
566 | vmovdqu y6, 14 * 16(rio); \ | ||
567 | vmovdqu y7, 15 * 16(rio); | ||
568 | |||
569 | .data | ||
570 | .align 16 | ||
571 | |||
572 | #define SHUFB_BYTES(idx) \ | ||
573 | 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) | ||
574 | |||
575 | .Lshufb_16x16b: | ||
576 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); | ||
577 | |||
578 | .Lpack_bswap: | ||
579 | .long 0x00010203 | ||
580 | .long 0x04050607 | ||
581 | .long 0x80808080 | ||
582 | .long 0x80808080 | ||
583 | |||
584 | /* For CTR-mode IV byteswap */ | ||
585 | .Lbswap128_mask: | ||
586 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
587 | |||
588 | /* | ||
589 | * pre-SubByte transform | ||
590 | * | ||
591 | * pre-lookup for sbox1, sbox2, sbox3: | ||
592 | * swap_bitendianness( | ||
593 | * isom_map_camellia_to_aes( | ||
594 | * camellia_f( | ||
595 | * swap_bitendianess(in) | ||
596 | * ) | ||
597 | * ) | ||
598 | * ) | ||
599 | * | ||
600 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
601 | */ | ||
602 | .Lpre_tf_lo_s1: | ||
603 | .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 | ||
604 | .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 | ||
605 | .Lpre_tf_hi_s1: | ||
606 | .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a | ||
607 | .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 | ||
608 | |||
609 | /* | ||
610 | * pre-SubByte transform | ||
611 | * | ||
612 | * pre-lookup for sbox4: | ||
613 | * swap_bitendianness( | ||
614 | * isom_map_camellia_to_aes( | ||
615 | * camellia_f( | ||
616 | * swap_bitendianess(in <<< 1) | ||
617 | * ) | ||
618 | * ) | ||
619 | * ) | ||
620 | * | ||
621 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
622 | */ | ||
623 | .Lpre_tf_lo_s4: | ||
624 | .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 | ||
625 | .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 | ||
626 | .Lpre_tf_hi_s4: | ||
627 | .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 | ||
628 | .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf | ||
629 | |||
630 | /* | ||
631 | * post-SubByte transform | ||
632 | * | ||
633 | * post-lookup for sbox1, sbox4: | ||
634 | * swap_bitendianness( | ||
635 | * camellia_h( | ||
636 | * isom_map_aes_to_camellia( | ||
637 | * swap_bitendianness( | ||
638 | * aes_inverse_affine_transform(in) | ||
639 | * ) | ||
640 | * ) | ||
641 | * ) | ||
642 | * ) | ||
643 | * | ||
644 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
645 | */ | ||
646 | .Lpost_tf_lo_s1: | ||
647 | .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 | ||
648 | .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 | ||
649 | .Lpost_tf_hi_s1: | ||
650 | .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 | ||
651 | .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c | ||
652 | |||
653 | /* | ||
654 | * post-SubByte transform | ||
655 | * | ||
656 | * post-lookup for sbox2: | ||
657 | * swap_bitendianness( | ||
658 | * camellia_h( | ||
659 | * isom_map_aes_to_camellia( | ||
660 | * swap_bitendianness( | ||
661 | * aes_inverse_affine_transform(in) | ||
662 | * ) | ||
663 | * ) | ||
664 | * ) | ||
665 | * ) <<< 1 | ||
666 | * | ||
667 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
668 | */ | ||
669 | .Lpost_tf_lo_s2: | ||
670 | .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 | ||
671 | .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 | ||
672 | .Lpost_tf_hi_s2: | ||
673 | .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 | ||
674 | .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 | ||
675 | |||
676 | /* | ||
677 | * post-SubByte transform | ||
678 | * | ||
679 | * post-lookup for sbox3: | ||
680 | * swap_bitendianness( | ||
681 | * camellia_h( | ||
682 | * isom_map_aes_to_camellia( | ||
683 | * swap_bitendianness( | ||
684 | * aes_inverse_affine_transform(in) | ||
685 | * ) | ||
686 | * ) | ||
687 | * ) | ||
688 | * ) >>> 1 | ||
689 | * | ||
690 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
691 | */ | ||
692 | .Lpost_tf_lo_s3: | ||
693 | .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 | ||
694 | .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 | ||
695 | .Lpost_tf_hi_s3: | ||
696 | .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 | ||
697 | .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 | ||
698 | |||
699 | /* For isolating SubBytes from AESENCLAST, inverse shift row */ | ||
700 | .Linv_shift_row: | ||
701 | .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b | ||
702 | .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 | ||
703 | |||
704 | /* 4-bit mask */ | ||
705 | .align 4 | ||
706 | .L0f0f0f0f: | ||
707 | .long 0x0f0f0f0f | ||
708 | |||
709 | .text | ||
710 | |||
711 | .align 8 | ||
712 | .type __camellia_enc_blk16,@function; | ||
713 | |||
714 | __camellia_enc_blk16: | ||
715 | /* input: | ||
716 | * %rdi: ctx, CTX | ||
717 | * %rax: temporary storage, 256 bytes | ||
718 | * %xmm0..%xmm15: 16 plaintext blocks | ||
719 | * output: | ||
720 | * %xmm0..%xmm15: 16 encrypted blocks, order swapped: | ||
721 | * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | ||
722 | */ | ||
723 | |||
724 | leaq 8 * 16(%rax), %rcx; | ||
725 | |||
726 | inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
727 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
728 | %xmm15, %rax, %rcx); | ||
729 | |||
730 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
731 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
732 | %xmm15, %rax, %rcx, 0); | ||
733 | |||
734 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
735 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
736 | %xmm15, | ||
737 | ((key_table + (8) * 8) + 0)(CTX), | ||
738 | ((key_table + (8) * 8) + 4)(CTX), | ||
739 | ((key_table + (8) * 8) + 8)(CTX), | ||
740 | ((key_table + (8) * 8) + 12)(CTX)); | ||
741 | |||
742 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
743 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
744 | %xmm15, %rax, %rcx, 8); | ||
745 | |||
746 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
747 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
748 | %xmm15, | ||
749 | ((key_table + (16) * 8) + 0)(CTX), | ||
750 | ((key_table + (16) * 8) + 4)(CTX), | ||
751 | ((key_table + (16) * 8) + 8)(CTX), | ||
752 | ((key_table + (16) * 8) + 12)(CTX)); | ||
753 | |||
754 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
755 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
756 | %xmm15, %rax, %rcx, 16); | ||
757 | |||
758 | movl $24, %r8d; | ||
759 | cmpl $16, key_length(CTX); | ||
760 | jne .Lenc_max32; | ||
761 | |||
762 | .Lenc_done: | ||
763 | /* load CD for output */ | ||
764 | vmovdqu 0 * 16(%rcx), %xmm8; | ||
765 | vmovdqu 1 * 16(%rcx), %xmm9; | ||
766 | vmovdqu 2 * 16(%rcx), %xmm10; | ||
767 | vmovdqu 3 * 16(%rcx), %xmm11; | ||
768 | vmovdqu 4 * 16(%rcx), %xmm12; | ||
769 | vmovdqu 5 * 16(%rcx), %xmm13; | ||
770 | vmovdqu 6 * 16(%rcx), %xmm14; | ||
771 | vmovdqu 7 * 16(%rcx), %xmm15; | ||
772 | |||
773 | outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
774 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
775 | %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax)); | ||
776 | |||
777 | ret; | ||
778 | |||
779 | .align 8 | ||
780 | .Lenc_max32: | ||
781 | movl $32, %r8d; | ||
782 | |||
783 | fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
784 | %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
785 | %xmm15, | ||
786 | ((key_table + (24) * 8) + 0)(CTX), | ||
787 | ((key_table + (24) * 8) + 4)(CTX), | ||
788 | ((key_table + (24) * 8) + 8)(CTX), | ||
789 | ((key_table + (24) * 8) + 12)(CTX)); | ||
790 | |||
791 | enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, | ||
792 | %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, | ||
793 | %xmm15, %rax, %rcx, 24); | ||
794 | |||
795 | jmp .Lenc_done; | ||
796 | |||
.align 8
.type __camellia_dec_blk16,@function;

__camellia_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */

	/* %rcx = second half of the 256-byte temporary area (CD halves) */
	leaq 8 * 16(%rax), %rcx;

	/* re-arrange input blocks for the round macros (inpack16_post is
	 * defined earlier in this file) */
	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	/* 192/256-bit keys (32 subkey slots) take one extra round/fls pass
	 * before falling through to the common 24-slot tail */
	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	/* decryption consumes the key schedule at decreasing offsets:
	 * subkeys from key_table[16..] */
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	/* FL/FL^-1 layer; note the subkey halves are passed in swapped
	 * (+8/+12 before +0/+4) relative to the encryption path */
	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	/* final whitening with key_table[0] and rearrangement for output */
	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	ret;

.align 8
.Ldec_max32:
	/* extra pass for 192/256-bit keys: subkeys from key_table[24..] */
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
880 | |||
.align 8
.global camellia_ecb_enc_16way
.type camellia_ecb_enc_16way,@function;

camellia_ecb_enc_16way:
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 */

	/* load the 16 source blocks and apply the key_table[0] whitening
	 * (inpack16_pre is defined earlier in this file) */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk16;

	/* __camellia_enc_blk16 leaves blocks in swapped register order,
	 * hence the reversed register list here */
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
906 | |||
.align 8
.global camellia_ecb_dec_16way
.type camellia_ecb_dec_16way,@function;

camellia_ecb_dec_16way:
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 */

	/* %r8d = index of the last subkey block: 24 for 128-bit keys,
	 * 32 for 192/256-bit keys (branch-free via cmove) */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	/* decryption whitens with the *last* subkey: key_table[%r8] */
	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk16;

	/* registers come back in swapped order; write reversed */
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
937 | |||
.align 8
.global camellia_cbc_dec_16way
.type camellia_cbc_dec_16way,@function;

camellia_cbc_dec_16way:
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 */

	/* %r8d = last subkey block index: 24 (128-bit key) or 32 (larger) */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	/* CBC chaining: xor block i with ciphertext block i-1.  Block 0
	 * (%xmm7) is left untouched here; NOTE(review): presumably the
	 * caller xors it with the IV — confirm against the glue helper. */
	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
989 | |||
/*
 * Increment the 128-bit little-endian value in x by one.
 * minus_one must hold { low qword: -1, high qword: 0 }; tmp is clobbered.
 * vpsubq x - minus_one adds 1 to the low qword (high qword unchanged);
 * the carry is detected by comparing the *old* low qword against -1
 * (vpcmpeqq), shifted into the high lane (vpslldq $8) and applied as a
 * second subtraction of -1 (i.e. +1) on the high qword.
 */
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;
995 | |||
.align 8
.global camellia_ctr_16way
.type camellia_ctr_16way,@function;

camellia_ctr_16way:
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst (16 blocks)
	 * %rdx: src (16 blocks)
	 * %rcx: iv (little endian, 128bit)
	 */

	/* 256-byte stack scratch area; %rax also serves as the temporary
	 * storage pointer expected by __camellia_enc_blk16 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	/* build the { low: -1, high: 0 } constant used by inc_le128 */
	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/* construct IVs: the first three counter blocks spill to the
	 * scratch area (slots 15, 14, 13); the remaining 13 stay in
	 * registers %xmm12 down to %xmm0 */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	/* last counter: keep unswapped copy, increment once more and
	 * store it back so the caller sees the next counter value */
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack16_pre: */
	/* open-coded so the three spilled counter blocks can be xored
	 * straight from the scratch area */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	/* CTR: xor keystream with source; note the swapped register
	 * order produced by __camellia_enc_blk16 */
	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c new file mode 100644 index 000000000000..96cbb6068fce --- /dev/null +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c | |||
@@ -0,0 +1,558 @@ | |||
1 | /* | ||
2 | * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia | ||
3 | * | ||
4 | * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/lrw.h> | ||
20 | #include <crypto/xts.h> | ||
21 | #include <asm/xcr.h> | ||
22 | #include <asm/xsave.h> | ||
23 | #include <asm/crypto/camellia.h> | ||
24 | #include <asm/crypto/ablk_helper.h> | ||
25 | #include <asm/crypto/glue_helper.h> | ||
26 | |||
27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 | ||
28 | |||
29 | /* 16-way AES-NI parallel cipher functions */ | ||
30 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
31 | const u8 *src); | ||
32 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
33 | const u8 *src); | ||
34 | |||
35 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
36 | const u8 *src); | ||
37 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | ||
38 | const u8 *src, le128 *iv); | ||
39 | |||
/*
 * ECB encryption dispatch table for the common glue layer: widest
 * implementation first; the glue code picks the largest num_blocks
 * that still fits the remaining data.
 */
static const struct common_glue_ctx camellia_enc = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
	} }
};
55 | |||
/*
 * CTR dispatch table: 16-way AVX/AES-NI implementation first, then the
 * 2-way and 1-way x86_64 assembler fallbacks for the tail blocks.
 */
static const struct common_glue_ctx camellia_ctr = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
	} }
};
71 | |||
/*
 * ECB decryption dispatch table, mirroring camellia_enc with the
 * decryption entry points.
 */
static const struct common_glue_ctx camellia_dec = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
	} }
};
87 | |||
/*
 * CBC decryption dispatch table; CBC decryption parallelizes (each
 * block only needs the previous ciphertext block), hence the 16-way
 * entry.  The 1-way fallback reuses the plain ECB decrypt.
 */
static const struct common_glue_ctx camellia_dec_cbc = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
	} }
};
103 | |||
/* blkcipher .encrypt handler for ecb(camellia); dispatches via glue layer. */
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
}
109 | |||
/* blkcipher .decrypt handler for ecb(camellia); dispatches via glue layer. */
static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
}
115 | |||
/*
 * blkcipher .encrypt handler for cbc(camellia).  CBC encryption is
 * inherently serial, so only the 1-way assembler function is used.
 */
static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
				       dst, src, nbytes);
}
122 | |||
/* blkcipher .decrypt handler for cbc(camellia); parallel 16-way capable. */
static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
				       nbytes);
}
129 | |||
/* blkcipher handler for ctr(camellia); CTR en-/decryption are the same op. */
static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		     struct scatterlist *src, unsigned int nbytes)
{
	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
}
135 | |||
/*
 * Claim the FPU for kernel use if it is not held yet and enough data
 * remains for a full 16-way batch; returns the updated fpu_enabled state.
 */
static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
			      nbytes);
}
142 | |||
/* Release the FPU if camellia_fpu_begin() claimed it. */
static inline void camellia_fpu_end(bool fpu_enabled)
{
	glue_fpu_end(fpu_enabled);
}
147 | |||
/*
 * crypto_tfm .setkey handler; defers to the shared x86_64 Camellia key
 * schedule (key length validation reported through crt_flags).
 */
static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
			   unsigned int key_len)
{
	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
				 &tfm->crt_flags);
}
154 | |||
/* Per-request state handed to the LRW/XTS crypt callbacks. */
struct crypt_priv {
	struct camellia_ctx *ctx;	/* cipher context (round keys) */
	bool fpu_enabled;		/* lazily set by camellia_fpu_begin() */
};
159 | |||
160 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
161 | { | ||
162 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
163 | struct crypt_priv *ctx = priv; | ||
164 | int i; | ||
165 | |||
166 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
167 | |||
168 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
169 | camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
170 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
171 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
172 | } | ||
173 | |||
174 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
175 | camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); | ||
176 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
177 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
178 | } | ||
179 | |||
180 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
181 | camellia_enc_blk(ctx->ctx, srcdst, srcdst); | ||
182 | } | ||
183 | |||
184 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
185 | { | ||
186 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
187 | struct crypt_priv *ctx = priv; | ||
188 | int i; | ||
189 | |||
190 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
191 | |||
192 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
193 | camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
194 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
195 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
196 | } | ||
197 | |||
198 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
199 | camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); | ||
200 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
201 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
202 | } | ||
203 | |||
204 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
205 | camellia_dec_blk(ctx->ctx, srcdst, srcdst); | ||
206 | } | ||
207 | |||
208 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
209 | struct scatterlist *src, unsigned int nbytes) | ||
210 | { | ||
211 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
212 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
213 | struct crypt_priv crypt_ctx = { | ||
214 | .ctx = &ctx->camellia_ctx, | ||
215 | .fpu_enabled = false, | ||
216 | }; | ||
217 | struct lrw_crypt_req req = { | ||
218 | .tbuf = buf, | ||
219 | .tbuflen = sizeof(buf), | ||
220 | |||
221 | .table_ctx = &ctx->lrw_table, | ||
222 | .crypt_ctx = &crypt_ctx, | ||
223 | .crypt_fn = encrypt_callback, | ||
224 | }; | ||
225 | int ret; | ||
226 | |||
227 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
228 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
229 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
230 | |||
231 | return ret; | ||
232 | } | ||
233 | |||
234 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
235 | struct scatterlist *src, unsigned int nbytes) | ||
236 | { | ||
237 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
238 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
239 | struct crypt_priv crypt_ctx = { | ||
240 | .ctx = &ctx->camellia_ctx, | ||
241 | .fpu_enabled = false, | ||
242 | }; | ||
243 | struct lrw_crypt_req req = { | ||
244 | .tbuf = buf, | ||
245 | .tbuflen = sizeof(buf), | ||
246 | |||
247 | .table_ctx = &ctx->lrw_table, | ||
248 | .crypt_ctx = &crypt_ctx, | ||
249 | .crypt_fn = decrypt_callback, | ||
250 | }; | ||
251 | int ret; | ||
252 | |||
253 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
254 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
255 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
256 | |||
257 | return ret; | ||
258 | } | ||
259 | |||
260 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
261 | struct scatterlist *src, unsigned int nbytes) | ||
262 | { | ||
263 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
264 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
265 | struct crypt_priv crypt_ctx = { | ||
266 | .ctx = &ctx->crypt_ctx, | ||
267 | .fpu_enabled = false, | ||
268 | }; | ||
269 | struct xts_crypt_req req = { | ||
270 | .tbuf = buf, | ||
271 | .tbuflen = sizeof(buf), | ||
272 | |||
273 | .tweak_ctx = &ctx->tweak_ctx, | ||
274 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | ||
275 | .crypt_ctx = &crypt_ctx, | ||
276 | .crypt_fn = encrypt_callback, | ||
277 | }; | ||
278 | int ret; | ||
279 | |||
280 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
281 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
282 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
283 | |||
284 | return ret; | ||
285 | } | ||
286 | |||
287 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
288 | struct scatterlist *src, unsigned int nbytes) | ||
289 | { | ||
290 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
291 | be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS]; | ||
292 | struct crypt_priv crypt_ctx = { | ||
293 | .ctx = &ctx->crypt_ctx, | ||
294 | .fpu_enabled = false, | ||
295 | }; | ||
296 | struct xts_crypt_req req = { | ||
297 | .tbuf = buf, | ||
298 | .tbuflen = sizeof(buf), | ||
299 | |||
300 | .tweak_ctx = &ctx->tweak_ctx, | ||
301 | .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk), | ||
302 | .crypt_ctx = &crypt_ctx, | ||
303 | .crypt_fn = decrypt_callback, | ||
304 | }; | ||
305 | int ret; | ||
306 | |||
307 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
308 | ret = xts_crypt(desc, dst, src, nbytes, &req); | ||
309 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
310 | |||
311 | return ret; | ||
312 | } | ||
313 | |||
/*
 * Algorithm registrations: the first five are internal synchronous
 * "__driver" variants (priority 0, called with the FPU context already
 * handled); the last five are the user-visible async wrappers built on
 * ablk_helper, which defer to the "__driver" algorithms.
 */
static struct crypto_alg cmll_algs[10] = { {
	.cra_name		= "__ecb-camellia-aesni",
	.cra_driver_name	= "__driver-ecb-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.setkey		= camellia_setkey,
			.encrypt	= ecb_encrypt,
			.decrypt	= ecb_decrypt,
		},
	},
}, {
	.cra_name		= "__cbc-camellia-aesni",
	.cra_driver_name	= "__driver-cbc-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.setkey		= camellia_setkey,
			.encrypt	= cbc_encrypt,
			.decrypt	= cbc_decrypt,
		},
	},
}, {
	/* CTR is a stream mode: blocksize 1, IV carries the counter */
	.cra_name		= "__ctr-camellia-aesni",
	.cra_driver_name	= "__driver-ctr-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= 1,
	.cra_ctxsize		= sizeof(struct camellia_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= camellia_setkey,
			.encrypt	= ctr_crypt,
			.decrypt	= ctr_crypt,
		},
	},
}, {
	/* LRW key = cipher key + one block of tweak-table key material */
	.cra_name		= "__lrw-camellia-aesni",
	.cra_driver_name	= "__driver-lrw-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_exit		= lrw_camellia_exit_tfm,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
					  CAMELLIA_BLOCK_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
					  CAMELLIA_BLOCK_SIZE,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= lrw_camellia_setkey,
			.encrypt	= lrw_encrypt,
			.decrypt	= lrw_decrypt,
		},
	},
}, {
	/* XTS uses two full keys: data key + tweak key */
	.cra_name		= "__xts-camellia-aesni",
	.cra_driver_name	= "__driver-xts-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= xts_camellia_setkey,
			.encrypt	= xts_encrypt,
			.decrypt	= xts_decrypt,
		},
	},
}, {
	/* public async wrappers (ablk_helper) start here, priority 400 */
	.cra_name		= "ecb(camellia)",
	.cra_driver_name	= "ecb-camellia-aesni",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct async_helper_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= ablk_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.setkey		= ablk_set_key,
			.encrypt	= ablk_encrypt,
			.decrypt	= ablk_decrypt,
		},
	},
}, {
	.cra_name		= "cbc(camellia)",
	.cra_driver_name	= "cbc-camellia-aesni",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct async_helper_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= ablk_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= ablk_set_key,
			/* NOTE(review): __ablk_encrypt (synchronous path)
			 * appears deliberate here — matches other
			 * ablk_helper-based glue; confirm against those */
			.encrypt	= __ablk_encrypt,
			.decrypt	= ablk_decrypt,
		},
	},
}, {
	.cra_name		= "ctr(camellia)",
	.cra_driver_name	= "ctr-camellia-aesni",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= 1,
	.cra_ctxsize		= sizeof(struct async_helper_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= ablk_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= ablk_set_key,
			.encrypt	= ablk_encrypt,
			/* CTR decrypt == encrypt */
			.decrypt	= ablk_encrypt,
			.geniv		= "chainiv",
		},
	},
}, {
	.cra_name		= "lrw(camellia)",
	.cra_driver_name	= "lrw-camellia-aesni",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct async_helper_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= ablk_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
					  CAMELLIA_BLOCK_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
					  CAMELLIA_BLOCK_SIZE,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= ablk_set_key,
			.encrypt	= ablk_encrypt,
			.decrypt	= ablk_decrypt,
		},
	},
}, {
	.cra_name		= "xts(camellia)",
	.cra_driver_name	= "xts-camellia-aesni",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct async_helper_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_ablkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_init		= ablk_init,
	.cra_exit		= ablk_exit,
	.cra_u = {
		.ablkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= ablk_set_key,
			.encrypt	= ablk_encrypt,
			.decrypt	= ablk_decrypt,
		},
	},
} };
528 | |||
/*
 * Module init: verify the CPU exposes AVX + AES-NI + OSXSAVE and that
 * the OS has enabled SSE and YMM state saving in XCR0 (AVX registers
 * are unusable otherwise), then register all ten algorithms.
 */
static int __init camellia_aesni_init(void)
{
	u64 xcr0;

	if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
		pr_info("AVX or AES-NI instructions are not detected.\n");
		return -ENODEV;
	}

	/* CPUID alone is not enough: the OS must opt in via XCR0 */
	xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
	if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
		pr_info("AVX detected but unusable.\n");
		return -ENODEV;
	}

	return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
}
546 | |||
/* Module exit: unregister everything registered at init. */
static void __exit camellia_aesni_fini(void)
{
	crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
}
551 | |||
module_init(camellia_aesni_init);
module_exit(camellia_aesni_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
/*
 * NOTE(review): plain MODULE_ALIAS() lets any request for the string
 * "camellia" auto-load this module; later kernels restrict crypto
 * aliases with MODULE_ALIAS_CRYPTO() to the "crypto-*" namespace.
 * Consider switching if/when that macro is available in this tree.
 */
MODULE_ALIAS("camellia");
MODULE_ALIAS("camellia-asm");