author	Jussi Kivilinna <jussi.kivilinna@mbnet.fi>	2012-10-26 07:49:01 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2012-11-09 04:32:32 -0500
commit	d9b1d2e7e10d2e926775b1d3da39da0f51491e54 (patch)
tree	58da273e600536c041e6962136a6295e6c8cfafe /arch/x86/crypto
parent	cf582ccedad02eb9bfdcdb25adfc800dd117b428 (diff)
crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
This patch adds an AES-NI/AVX/x86_64 assembler implementation of the
Camellia block cipher. The implementation processes data in sixteen-block
chunks, which are byte-sliced, and the AES SubBytes operation is reused
for the Camellia s-box with the help of pre- and post-filtering. The
patch has been tested with tcrypt and automated filesystem tests.

tcrypt test results:

Intel Core i5-2450M:

camellia-aesni-avx vs camellia-asm-x86_64-2way:

128bit key:                                             (lrw:256bit)    (xts:256bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.98x   0.96x   0.99x   0.96x   0.96x   0.95x   0.95x   0.94x   0.97x   0.98x
64B     0.99x   0.98x   1.00x   0.98x   0.98x   0.99x   0.98x   0.93x   0.99x   0.98x
256B    2.28x   2.28x   1.01x   2.29x   2.25x   2.24x   1.96x   1.97x   1.91x   1.90x
1024B   2.57x   2.56x   1.00x   2.57x   2.51x   2.53x   2.19x   2.17x   2.19x   2.22x
8192B   2.49x   2.49x   1.00x   2.53x   2.48x   2.49x   2.17x   2.17x   2.22x   2.22x

256bit key:                                             (lrw:384bit)    (xts:512bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   0.98x   0.99x   0.97x   0.97x   0.96x   0.97x   0.98x   0.98x   0.99x
64B     1.00x   1.00x   1.01x   0.99x   0.98x   0.99x   0.99x   0.99x   0.99x   0.99x
256B    2.37x   2.37x   1.01x   2.39x   2.35x   2.33x   2.10x   2.11x   1.99x   2.02x
1024B   2.58x   2.60x   1.00x   2.58x   2.56x   2.56x   2.28x   2.29x   2.28x   2.29x
8192B   2.50x   2.52x   1.00x   2.56x   2.51x   2.51x   2.24x   2.25x   2.26x   2.29x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--	arch/x86/crypto/Makefile	3
-rw-r--r--	arch/x86/crypto/camellia-aesni-avx-asm_64.S	1102
-rw-r--r--	arch/x86/crypto/camellia_aesni_avx_glue.c	558
3 files changed, 1663 insertions(+), 0 deletions(-)
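For readers unfamiliar with the technique, the key trick is that AES-NI
only provides the AES s-box, so the Camellia s-boxes are computed by
sandwiching AES SubBytes between two affine "filter" lookups split over
the low and high nibbles of each byte. A minimal C sketch of the idea
(illustrative only, not part of the patch; the helper and table names
echo the .Lpre_tf_*/.Lpost_tf_* labels in the assembly below but are
otherwise hypothetical):

	#include <stdint.h>

	/* 16-entry nibble tables, corresponding to .Lpre_tf_{lo,hi}_s1
	 * and .Lpost_tf_{lo,hi}_s1 in the assembly */
	extern const uint8_t pre_tf_lo_s1[16], pre_tf_hi_s1[16];
	extern const uint8_t post_tf_lo_s1[16], post_tf_hi_s1[16];
	/* per-byte AES s-box; the assembly gets this from vaesenclast
	 * with a zero round key, after undoing ShiftRows */
	extern uint8_t aes_sbox(uint8_t b);

	/* scalar model of the filter_8bit macro: two nibble lookups and
	 * an xor; one vpshufb pair does this for 16 bytes at once */
	static uint8_t filter_8bit(uint8_t x, const uint8_t lo[16],
				   const uint8_t hi[16])
	{
		return lo[x & 0x0f] ^ hi[x >> 4];
	}

	/* Camellia s-box 1 routed through the AES s-box */
	static uint8_t camellia_s1(uint8_t x)
	{
		uint8_t y = filter_8bit(x, pre_tf_lo_s1, pre_tf_hi_s1);

		return filter_8bit(aes_sbox(y), post_tf_lo_s1,
				   post_tf_hi_s1);
	}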
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 84d7dbaba26e..e0ca7c9ac383 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+			       camellia_aesni_avx_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..2306d2e4816f
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1102 @@
1/*
2 * x86_64/AVX/AES-NI assembler implementation of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13/*
14 * Version licensed under 2-clause BSD License is available at:
15 * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
16 */
17
18#define CAMELLIA_TABLE_BYTE_LEN 272
19
20/* struct camellia_ctx: */
21#define key_table 0
22#define key_length CAMELLIA_TABLE_BYTE_LEN
23
24/* register macros */
25#define CTX %rdi
26
27/**********************************************************************
28 16-way camellia
29 **********************************************************************/
30#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
31 vpand x, mask4bit, tmp0; \
32 vpandn x, mask4bit, x; \
33 vpsrld $4, x, x; \
34 \
35 vpshufb tmp0, lo_t, tmp0; \
36 vpshufb x, hi_t, x; \
37 vpxor tmp0, x, x;
38
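/*
 * Per byte this computes  x = lo_t[x & 0x0f] ^ hi_t[x >> 4];  the two
 * vpshufb instructions perform the 16-entry table lookups for all 16
 * bytes of the vector in parallel.
 */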
39/*
40 * IN:
41 * x0..x7: byte-sliced AB state
42 * mem_cd: register pointer storing CD state
43 * key: index for key material
44 * OUT:
45 * x0..x7: new byte-sliced CD state
46 */
47#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
48 t7, mem_cd, key) \
49 /* \
50 * S-function with AES subbytes \
51 */ \
52 vmovdqa .Linv_shift_row, t4; \
53 vbroadcastss .L0f0f0f0f, t7; \
54 vmovdqa .Lpre_tf_lo_s1, t0; \
55 vmovdqa .Lpre_tf_hi_s1, t1; \
56 \
57 /* AES inverse shift rows */ \
58 vpshufb t4, x0, x0; \
59 vpshufb t4, x7, x7; \
60 vpshufb t4, x1, x1; \
61 vpshufb t4, x4, x4; \
62 vpshufb t4, x2, x2; \
63 vpshufb t4, x5, x5; \
64 vpshufb t4, x3, x3; \
65 vpshufb t4, x6, x6; \
66 \
67 /* prefilter sboxes 1, 2 and 3 */ \
68 vmovdqa .Lpre_tf_lo_s4, t2; \
69 vmovdqa .Lpre_tf_hi_s4, t3; \
70 filter_8bit(x0, t0, t1, t7, t6); \
71 filter_8bit(x7, t0, t1, t7, t6); \
72 filter_8bit(x1, t0, t1, t7, t6); \
73 filter_8bit(x4, t0, t1, t7, t6); \
74 filter_8bit(x2, t0, t1, t7, t6); \
75 filter_8bit(x5, t0, t1, t7, t6); \
76 \
77 /* prefilter sbox 4 */ \
78 vpxor t4, t4, t4; \
79 filter_8bit(x3, t2, t3, t7, t6); \
80 filter_8bit(x6, t2, t3, t7, t6); \
81 \
82 /* AES subbytes + AES shift rows */ \
83 vmovdqa .Lpost_tf_lo_s1, t0; \
84 vmovdqa .Lpost_tf_hi_s1, t1; \
85 vaesenclast t4, x0, x0; \
86 vaesenclast t4, x7, x7; \
87 vaesenclast t4, x1, x1; \
88 vaesenclast t4, x4, x4; \
89 vaesenclast t4, x2, x2; \
90 vaesenclast t4, x5, x5; \
91 vaesenclast t4, x3, x3; \
92 vaesenclast t4, x6, x6; \
93 \
94 /* postfilter sboxes 1 and 4 */ \
95 vmovdqa .Lpost_tf_lo_s3, t2; \
96 vmovdqa .Lpost_tf_hi_s3, t3; \
97 filter_8bit(x0, t0, t1, t7, t6); \
98 filter_8bit(x7, t0, t1, t7, t6); \
99 filter_8bit(x3, t0, t1, t7, t6); \
100 filter_8bit(x6, t0, t1, t7, t6); \
101 \
102 /* postfilter sbox 3 */ \
103 vmovdqa .Lpost_tf_lo_s2, t4; \
104 vmovdqa .Lpost_tf_hi_s2, t5; \
105 filter_8bit(x2, t2, t3, t7, t6); \
106 filter_8bit(x5, t2, t3, t7, t6); \
107 \
108 vpxor t6, t6, t6; \
109 vmovq key, t0; \
110 \
111 /* postfilter sbox 2 */ \
112 filter_8bit(x1, t4, t5, t7, t2); \
113 filter_8bit(x4, t4, t5, t7, t2); \
114 \
115 vpsrldq $5, t0, t5; \
116 vpsrldq $1, t0, t1; \
117 vpsrldq $2, t0, t2; \
118 vpsrldq $3, t0, t3; \
119 vpsrldq $4, t0, t4; \
120 vpshufb t6, t0, t0; \
121 vpshufb t6, t1, t1; \
122 vpshufb t6, t2, t2; \
123 vpshufb t6, t3, t3; \
124 vpshufb t6, t4, t4; \
125 vpsrldq $2, t5, t7; \
126 vpshufb t6, t7, t7; \
127 \
128 /* \
129 * P-function \
130 */ \
131 vpxor x5, x0, x0; \
132 vpxor x6, x1, x1; \
133 vpxor x7, x2, x2; \
134 vpxor x4, x3, x3; \
135 \
136 vpxor x2, x4, x4; \
137 vpxor x3, x5, x5; \
138 vpxor x0, x6, x6; \
139 vpxor x1, x7, x7; \
140 \
141 vpxor x7, x0, x0; \
142 vpxor x4, x1, x1; \
143 vpxor x5, x2, x2; \
144 vpxor x6, x3, x3; \
145 \
146 vpxor x3, x4, x4; \
147 vpxor x0, x5, x5; \
148 vpxor x1, x6, x6; \
149 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
150 \
151 /* \
152 * Add key material and result to CD (x becomes new CD) \
153 */ \
154 \
155 vpxor t3, x4, x4; \
156 vpxor 0 * 16(mem_cd), x4, x4; \
157 \
158 vpxor t2, x5, x5; \
159 vpxor 1 * 16(mem_cd), x5, x5; \
160 \
161 vpsrldq $1, t5, t3; \
162 vpshufb t6, t5, t5; \
163 vpshufb t6, t3, t6; \
164 \
165 vpxor t1, x6, x6; \
166 vpxor 2 * 16(mem_cd), x6, x6; \
167 \
168 vpxor t0, x7, x7; \
169 vpxor 3 * 16(mem_cd), x7, x7; \
170 \
171 vpxor t7, x0, x0; \
172 vpxor 4 * 16(mem_cd), x0, x0; \
173 \
174 vpxor t6, x1, x1; \
175 vpxor 5 * 16(mem_cd), x1, x1; \
176 \
177 vpxor t5, x2, x2; \
178 vpxor 6 * 16(mem_cd), x2, x2; \
179 \
180 vpxor t4, x3, x3; \
181 vpxor 7 * 16(mem_cd), x3, x3;
182
183/*
184 * Size optimization... with inlined roundsm16, binary would be over 5 times
185 * larger and would only be 0.5% faster (on sandy-bridge).
186 */
187.align 8
188roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
189 roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
190 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
191 %rcx, (%r9));
192 ret;
193
194.align 8
195roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
196 roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
197 %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
198 %rax, (%r9));
199 ret;
200
201/*
202 * IN/OUT:
203 * x0..x7: byte-sliced AB state preloaded
204 * mem_ab: byte-sliced AB state in memory
205 * mem_cd: byte-sliced CD state in memory
206 */
207#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
208 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
209 leaq (key_table + (i) * 8)(CTX), %r9; \
210 call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
211 \
212 vmovdqu x4, 0 * 16(mem_cd); \
213 vmovdqu x5, 1 * 16(mem_cd); \
214 vmovdqu x6, 2 * 16(mem_cd); \
215 vmovdqu x7, 3 * 16(mem_cd); \
216 vmovdqu x0, 4 * 16(mem_cd); \
217 vmovdqu x1, 5 * 16(mem_cd); \
218 vmovdqu x2, 6 * 16(mem_cd); \
219 vmovdqu x3, 7 * 16(mem_cd); \
220 \
221 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
222 call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
223 \
224 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
225
226#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
227
228#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
229 /* Store new AB state */ \
230 vmovdqu x0, 0 * 16(mem_ab); \
231 vmovdqu x1, 1 * 16(mem_ab); \
232 vmovdqu x2, 2 * 16(mem_ab); \
233 vmovdqu x3, 3 * 16(mem_ab); \
234 vmovdqu x4, 4 * 16(mem_ab); \
235 vmovdqu x5, 5 * 16(mem_ab); \
236 vmovdqu x6, 6 * 16(mem_ab); \
237 vmovdqu x7, 7 * 16(mem_ab);
238
239#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
240 y6, y7, mem_ab, mem_cd, i) \
241 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
242 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
243 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
244 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
245 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
246 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
247
248#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
249 y6, y7, mem_ab, mem_cd, i) \
250 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
251 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
252 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
253 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
254 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
255 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
256
257/*
258 * IN:
259 * v0..3: byte-sliced 32-bit integers
260 * OUT:
261 * v0..3: (IN <<< 1)
262 */
263#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
264 vpcmpgtb v0, zero, t0; \
265 vpaddb v0, v0, v0; \
266 vpabsb t0, t0; \
267 \
268 vpcmpgtb v1, zero, t1; \
269 vpaddb v1, v1, v1; \
270 vpabsb t1, t1; \
271 \
272 vpcmpgtb v2, zero, t2; \
273 vpaddb v2, v2, v2; \
274 vpabsb t2, t2; \
275 \
276 vpor t0, v1, v1; \
277 \
278 vpcmpgtb v3, zero, t0; \
279 vpaddb v3, v3, v3; \
280 vpabsb t0, t0; \
281 \
282 vpor t1, v2, v2; \
283 vpor t2, v3, v3; \
284 vpor t0, v0, v0;
285
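/*
 * Scalar equivalent, per 32-bit word w spread across the four slices:
 *   w = (w << 1) | (w >> 31)
 * vpaddb doubles each byte (shift left by one), vpcmpgtb against zero
 * followed by vpabsb extracts each byte's old MSB as 0/1, and the vpor
 * chain feeds those carry bits into the neighbouring slice, wrapping
 * around at the end.
 */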
286/*
287 * IN:
288 * l: byte-sliced AB state in memory
289 * r: byte-sliced CD state in memory
290 * OUT:
291 * x0..x7: new byte-sliced CD state
292 */
293#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
294 tt1, tt2, tt3, kll, klr, krl, krr) \
295 /* \
296 * t0 = kll; \
297 * t0 &= ll; \
298 * lr ^= rol32(t0, 1); \
299 */ \
300 vpxor tt0, tt0, tt0; \
301 vmovd kll, t0; \
302 vpshufb tt0, t0, t3; \
303 vpsrldq $1, t0, t0; \
304 vpshufb tt0, t0, t2; \
305 vpsrldq $1, t0, t0; \
306 vpshufb tt0, t0, t1; \
307 vpsrldq $1, t0, t0; \
308 vpshufb tt0, t0, t0; \
309 \
310 vpand l0, t0, t0; \
311 vpand l1, t1, t1; \
312 vpand l2, t2, t2; \
313 vpand l3, t3, t3; \
314 \
315 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
316 \
317 vpxor l4, t0, l4; \
318 vmovdqu l4, 4 * 16(l); \
319 vpxor l5, t1, l5; \
320 vmovdqu l5, 5 * 16(l); \
321 vpxor l6, t2, l6; \
322 vmovdqu l6, 6 * 16(l); \
323 vpxor l7, t3, l7; \
324 vmovdqu l7, 7 * 16(l); \
325 \
326 /* \
327 * t2 = krr; \
328 * t2 |= rr; \
329 * rl ^= t2; \
330 */ \
331 \
332 vmovd krr, t0; \
333 vpshufb tt0, t0, t3; \
334 vpsrldq $1, t0, t0; \
335 vpshufb tt0, t0, t2; \
336 vpsrldq $1, t0, t0; \
337 vpshufb tt0, t0, t1; \
338 vpsrldq $1, t0, t0; \
339 vpshufb tt0, t0, t0; \
340 \
341 vpor 4 * 16(r), t0, t0; \
342 vpor 5 * 16(r), t1, t1; \
343 vpor 6 * 16(r), t2, t2; \
344 vpor 7 * 16(r), t3, t3; \
345 \
346 vpxor 0 * 16(r), t0, t0; \
347 vpxor 1 * 16(r), t1, t1; \
348 vpxor 2 * 16(r), t2, t2; \
349 vpxor 3 * 16(r), t3, t3; \
350 vmovdqu t0, 0 * 16(r); \
351 vmovdqu t1, 1 * 16(r); \
352 vmovdqu t2, 2 * 16(r); \
353 vmovdqu t3, 3 * 16(r); \
354 \
355 /* \
356 * t2 = krl; \
357 * t2 &= rl; \
358 * rr ^= rol32(t2, 1); \
359 */ \
360 vmovd krl, t0; \
361 vpshufb tt0, t0, t3; \
362 vpsrldq $1, t0, t0; \
363 vpshufb tt0, t0, t2; \
364 vpsrldq $1, t0, t0; \
365 vpshufb tt0, t0, t1; \
366 vpsrldq $1, t0, t0; \
367 vpshufb tt0, t0, t0; \
368 \
369 vpand 0 * 16(r), t0, t0; \
370 vpand 1 * 16(r), t1, t1; \
371 vpand 2 * 16(r), t2, t2; \
372 vpand 3 * 16(r), t3, t3; \
373 \
374 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
375 \
376 vpxor 4 * 16(r), t0, t0; \
377 vpxor 5 * 16(r), t1, t1; \
378 vpxor 6 * 16(r), t2, t2; \
379 vpxor 7 * 16(r), t3, t3; \
380 vmovdqu t0, 4 * 16(r); \
381 vmovdqu t1, 5 * 16(r); \
382 vmovdqu t2, 6 * 16(r); \
383 vmovdqu t3, 7 * 16(r); \
384 \
385 /* \
386 * t0 = klr; \
387 * t0 |= lr; \
388 * ll ^= t0; \
389 */ \
390 \
391 vmovd klr, t0; \
392 vpshufb tt0, t0, t3; \
393 vpsrldq $1, t0, t0; \
394 vpshufb tt0, t0, t2; \
395 vpsrldq $1, t0, t0; \
396 vpshufb tt0, t0, t1; \
397 vpsrldq $1, t0, t0; \
398 vpshufb tt0, t0, t0; \
399 \
400 vpor l4, t0, t0; \
401 vpor l5, t1, t1; \
402 vpor l6, t2, t2; \
403 vpor l7, t3, t3; \
404 \
405 vpxor l0, t0, l0; \
406 vmovdqu l0, 0 * 16(l); \
407 vpxor l1, t1, l1; \
408 vmovdqu l1, 1 * 16(l); \
409 vpxor l2, t2, l2; \
410 vmovdqu l2, 2 * 16(l); \
411 vpxor l3, t3, l3; \
412 vmovdqu l3, 3 * 16(l);
413
414#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
415 vpunpckhdq x1, x0, t2; \
416 vpunpckldq x1, x0, x0; \
417 \
418 vpunpckldq x3, x2, t1; \
419 vpunpckhdq x3, x2, x2; \
420 \
421 vpunpckhqdq t1, x0, x1; \
422 vpunpcklqdq t1, x0, x0; \
423 \
424 vpunpckhqdq x2, t2, x3; \
425 vpunpcklqdq x2, t2, x2;
426
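/*
 * Standard 4x4 transpose of 32-bit words, done in two levels of unpack
 * instructions: the dword unpacks interleave neighbouring rows, the
 * qword unpacks then recombine the halves.
 */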
427#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
428 b3, c3, d3, st0, st1) \
429 vmovdqu d2, st0; \
430 vmovdqu d3, st1; \
431 transpose_4x4(a0, a1, a2, a3, d2, d3); \
432 transpose_4x4(b0, b1, b2, b3, d2, d3); \
433 vmovdqu st0, d2; \
434 vmovdqu st1, d3; \
435 \
436 vmovdqu a0, st0; \
437 vmovdqu a1, st1; \
438 transpose_4x4(c0, c1, c2, c3, a0, a1); \
439 transpose_4x4(d0, d1, d2, d3, a0, a1); \
440 \
441 vmovdqu .Lshufb_16x16b, a0; \
442 vmovdqu st1, a1; \
443 vpshufb a0, a2, a2; \
444 vpshufb a0, a3, a3; \
445 vpshufb a0, b0, b0; \
446 vpshufb a0, b1, b1; \
447 vpshufb a0, b2, b2; \
448 vpshufb a0, b3, b3; \
449 vpshufb a0, a1, a1; \
450 vpshufb a0, c0, c0; \
451 vpshufb a0, c1, c1; \
452 vpshufb a0, c2, c2; \
453 vpshufb a0, c3, c3; \
454 vpshufb a0, d0, d0; \
455 vpshufb a0, d1, d1; \
456 vpshufb a0, d2, d2; \
457 vpshufb a0, d3, d3; \
458 vmovdqu d3, st1; \
459 vmovdqu st0, d3; \
460 vpshufb a0, d3, a0; \
461 vmovdqu d2, st0; \
462 \
463 transpose_4x4(a0, b0, c0, d0, d2, d3); \
464 transpose_4x4(a1, b1, c1, d1, d2, d3); \
465 vmovdqu st0, d2; \
466 vmovdqu st1, d3; \
467 \
468 vmovdqu b0, st0; \
469 vmovdqu b1, st1; \
470 transpose_4x4(a2, b2, c2, d2, b0, b1); \
471 transpose_4x4(a3, b3, c3, d3, b0, b1); \
472 vmovdqu st0, b0; \
473 vmovdqu st1, b1; \
474 /* does not adjust output bytes inside vectors */
475
476/* load blocks to registers and apply pre-whitening */
477#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
478 y6, y7, rio, key) \
479 vmovq key, x0; \
480 vpshufb .Lpack_bswap, x0, x0; \
481 \
482 vpxor 0 * 16(rio), x0, y7; \
483 vpxor 1 * 16(rio), x0, y6; \
484 vpxor 2 * 16(rio), x0, y5; \
485 vpxor 3 * 16(rio), x0, y4; \
486 vpxor 4 * 16(rio), x0, y3; \
487 vpxor 5 * 16(rio), x0, y2; \
488 vpxor 6 * 16(rio), x0, y1; \
489 vpxor 7 * 16(rio), x0, y0; \
490 vpxor 8 * 16(rio), x0, x7; \
491 vpxor 9 * 16(rio), x0, x6; \
492 vpxor 10 * 16(rio), x0, x5; \
493 vpxor 11 * 16(rio), x0, x4; \
494 vpxor 12 * 16(rio), x0, x3; \
495 vpxor 13 * 16(rio), x0, x2; \
496 vpxor 14 * 16(rio), x0, x1; \
497 vpxor 15 * 16(rio), x0, x0;
498
499/* byteslice pre-whitened blocks and store to temporary memory */
500#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
501 y6, y7, mem_ab, mem_cd) \
502 byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
503 y5, y6, y7, (mem_ab), (mem_cd)); \
504 \
505 vmovdqu x0, 0 * 16(mem_ab); \
506 vmovdqu x1, 1 * 16(mem_ab); \
507 vmovdqu x2, 2 * 16(mem_ab); \
508 vmovdqu x3, 3 * 16(mem_ab); \
509 vmovdqu x4, 4 * 16(mem_ab); \
510 vmovdqu x5, 5 * 16(mem_ab); \
511 vmovdqu x6, 6 * 16(mem_ab); \
512 vmovdqu x7, 7 * 16(mem_ab); \
513 vmovdqu y0, 0 * 16(mem_cd); \
514 vmovdqu y1, 1 * 16(mem_cd); \
515 vmovdqu y2, 2 * 16(mem_cd); \
516 vmovdqu y3, 3 * 16(mem_cd); \
517 vmovdqu y4, 4 * 16(mem_cd); \
518 vmovdqu y5, 5 * 16(mem_cd); \
519 vmovdqu y6, 6 * 16(mem_cd); \
520 vmovdqu y7, 7 * 16(mem_cd);
521
522/* de-byteslice, apply post-whitening and store blocks */
523#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
524 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
525 byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
526 y7, x3, x7, stack_tmp0, stack_tmp1); \
527 \
528 vmovdqu x0, stack_tmp0; \
529 \
530 vmovq key, x0; \
531 vpshufb .Lpack_bswap, x0, x0; \
532 \
533 vpxor x0, y7, y7; \
534 vpxor x0, y6, y6; \
535 vpxor x0, y5, y5; \
536 vpxor x0, y4, y4; \
537 vpxor x0, y3, y3; \
538 vpxor x0, y2, y2; \
539 vpxor x0, y1, y1; \
540 vpxor x0, y0, y0; \
541 vpxor x0, x7, x7; \
542 vpxor x0, x6, x6; \
543 vpxor x0, x5, x5; \
544 vpxor x0, x4, x4; \
545 vpxor x0, x3, x3; \
546 vpxor x0, x2, x2; \
547 vpxor x0, x1, x1; \
548 vpxor stack_tmp0, x0, x0;
549
550#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
551 y6, y7, rio) \
552 vmovdqu x0, 0 * 16(rio); \
553 vmovdqu x1, 1 * 16(rio); \
554 vmovdqu x2, 2 * 16(rio); \
555 vmovdqu x3, 3 * 16(rio); \
556 vmovdqu x4, 4 * 16(rio); \
557 vmovdqu x5, 5 * 16(rio); \
558 vmovdqu x6, 6 * 16(rio); \
559 vmovdqu x7, 7 * 16(rio); \
560 vmovdqu y0, 8 * 16(rio); \
561 vmovdqu y1, 9 * 16(rio); \
562 vmovdqu y2, 10 * 16(rio); \
563 vmovdqu y3, 11 * 16(rio); \
564 vmovdqu y4, 12 * 16(rio); \
565 vmovdqu y5, 13 * 16(rio); \
566 vmovdqu y6, 14 * 16(rio); \
567 vmovdqu y7, 15 * 16(rio);
568
569.data
570.align 16
571
572#define SHUFB_BYTES(idx) \
573 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
574
575.Lshufb_16x16b:
576 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
577
578.Lpack_bswap:
579 .long 0x00010203
580 .long 0x04050607
581 .long 0x80808080
582 .long 0x80808080
583
584/* For CTR-mode IV byteswap */
585.Lbswap128_mask:
586 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
587
588/*
589 * pre-SubByte transform
590 *
591 * pre-lookup for sbox1, sbox2, sbox3:
592 * swap_bitendianness(
593 * isom_map_camellia_to_aes(
594 * camellia_f(
595 * swap_bitendianness(in)
596 * )
597 * )
598 * )
599 *
600 * (note: '⊕ 0xc5' inside camellia_f())
601 */
602.Lpre_tf_lo_s1:
603 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
604 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
605.Lpre_tf_hi_s1:
606 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
607 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
608
609/*
610 * pre-SubByte transform
611 *
612 * pre-lookup for sbox4:
613 * swap_bitendianness(
614 * isom_map_camellia_to_aes(
615 * camellia_f(
616 * swap_bitendianness(in <<< 1)
617 * )
618 * )
619 * )
620 *
621 * (note: '⊕ 0xc5' inside camellia_f())
622 */
623.Lpre_tf_lo_s4:
624 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
625 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
626.Lpre_tf_hi_s4:
627 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
628 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
629
630/*
631 * post-SubByte transform
632 *
633 * post-lookup for sbox1, sbox4:
634 * swap_bitendianness(
635 * camellia_h(
636 * isom_map_aes_to_camellia(
637 * swap_bitendianness(
638 * aes_inverse_affine_transform(in)
639 * )
640 * )
641 * )
642 * )
643 *
644 * (note: '⊕ 0x6e' inside camellia_h())
645 */
646.Lpost_tf_lo_s1:
647 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
648 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
649.Lpost_tf_hi_s1:
650 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
651 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
652
653/*
654 * post-SubByte transform
655 *
656 * post-lookup for sbox2:
657 * swap_bitendianness(
658 * camellia_h(
659 * isom_map_aes_to_camellia(
660 * swap_bitendianness(
661 * aes_inverse_affine_transform(in)
662 * )
663 * )
664 * )
665 * ) <<< 1
666 *
667 * (note: '⊕ 0x6e' inside camellia_h())
668 */
669.Lpost_tf_lo_s2:
670 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
671 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
672.Lpost_tf_hi_s2:
673 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
674 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
675
676/*
677 * post-SubByte transform
678 *
679 * post-lookup for sbox3:
680 * swap_bitendianness(
681 * camellia_h(
682 * isom_map_aes_to_camellia(
683 * swap_bitendianness(
684 * aes_inverse_affine_transform(in)
685 * )
686 * )
687 * )
688 * ) >>> 1
689 *
690 * (note: '⊕ 0x6e' inside camellia_h())
691 */
692.Lpost_tf_lo_s3:
693 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
694 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
695.Lpost_tf_hi_s3:
696 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
697 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
698
699/* For isolating SubBytes from AESENCLAST, inverse shift row */
700.Linv_shift_row:
701 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
702 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
703
704/* 4-bit mask */
705.align 4
706.L0f0f0f0f:
707 .long 0x0f0f0f0f
708
709.text
710
711.align 8
712.type __camellia_enc_blk16,@function;
713
714__camellia_enc_blk16:
715 /* input:
716 * %rdi: ctx, CTX
717 * %rax: temporary storage, 256 bytes
718 * %xmm0..%xmm15: 16 plaintext blocks
719 * output:
720 * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
721 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
722 */
723
724 leaq 8 * 16(%rax), %rcx;
725
726 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
727 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
728 %xmm15, %rax, %rcx);
729
730 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
731 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
732 %xmm15, %rax, %rcx, 0);
733
734 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
735 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
736 %xmm15,
737 ((key_table + (8) * 8) + 0)(CTX),
738 ((key_table + (8) * 8) + 4)(CTX),
739 ((key_table + (8) * 8) + 8)(CTX),
740 ((key_table + (8) * 8) + 12)(CTX));
741
742 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
743 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
744 %xmm15, %rax, %rcx, 8);
745
746 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
747 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
748 %xmm15,
749 ((key_table + (16) * 8) + 0)(CTX),
750 ((key_table + (16) * 8) + 4)(CTX),
751 ((key_table + (16) * 8) + 8)(CTX),
752 ((key_table + (16) * 8) + 12)(CTX));
753
754 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
755 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
756 %xmm15, %rax, %rcx, 16);
757
758 movl $24, %r8d;
759 cmpl $16, key_length(CTX);
760 jne .Lenc_max32;
761
762.Lenc_done:
763 /* load CD for output */
764 vmovdqu 0 * 16(%rcx), %xmm8;
765 vmovdqu 1 * 16(%rcx), %xmm9;
766 vmovdqu 2 * 16(%rcx), %xmm10;
767 vmovdqu 3 * 16(%rcx), %xmm11;
768 vmovdqu 4 * 16(%rcx), %xmm12;
769 vmovdqu 5 * 16(%rcx), %xmm13;
770 vmovdqu 6 * 16(%rcx), %xmm14;
771 vmovdqu 7 * 16(%rcx), %xmm15;
772
773 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
774 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
775 %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
776
777 ret;
778
779.align 8
780.Lenc_max32:
781 movl $32, %r8d;
782
783 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
784 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
785 %xmm15,
786 ((key_table + (24) * 8) + 0)(CTX),
787 ((key_table + (24) * 8) + 4)(CTX),
788 ((key_table + (24) * 8) + 8)(CTX),
789 ((key_table + (24) * 8) + 12)(CTX));
790
791 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
792 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
793 %xmm15, %rax, %rcx, 24);
794
795 jmp .Lenc_done;
796
797.align 8
798.type __camellia_dec_blk16,@function;
799
800__camellia_dec_blk16:
801 /* input:
802 * %rdi: ctx, CTX
803 * %rax: temporary storage, 256 bytes
804 * %r8d: 24 for 16-byte key, 32 for larger
805 * %xmm0..%xmm15: 16 encrypted blocks
806 * output:
807 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
808 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
809 */
810
811 leaq 8 * 16(%rax), %rcx;
812
813 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
814 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
815 %xmm15, %rax, %rcx);
816
817 cmpl $32, %r8d;
818 je .Ldec_max32;
819
820.Ldec_max24:
821 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
822 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
823 %xmm15, %rax, %rcx, 16);
824
825 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
826 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
827 %xmm15,
828 ((key_table + (16) * 8) + 8)(CTX),
829 ((key_table + (16) * 8) + 12)(CTX),
830 ((key_table + (16) * 8) + 0)(CTX),
831 ((key_table + (16) * 8) + 4)(CTX));
832
833 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
834 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
835 %xmm15, %rax, %rcx, 8);
836
837 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
838 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
839 %xmm15,
840 ((key_table + (8) * 8) + 8)(CTX),
841 ((key_table + (8) * 8) + 12)(CTX),
842 ((key_table + (8) * 8) + 0)(CTX),
843 ((key_table + (8) * 8) + 4)(CTX));
844
845 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
846 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
847 %xmm15, %rax, %rcx, 0);
848
849 /* load CD for output */
850 vmovdqu 0 * 16(%rcx), %xmm8;
851 vmovdqu 1 * 16(%rcx), %xmm9;
852 vmovdqu 2 * 16(%rcx), %xmm10;
853 vmovdqu 3 * 16(%rcx), %xmm11;
854 vmovdqu 4 * 16(%rcx), %xmm12;
855 vmovdqu 5 * 16(%rcx), %xmm13;
856 vmovdqu 6 * 16(%rcx), %xmm14;
857 vmovdqu 7 * 16(%rcx), %xmm15;
858
859 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
860 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
861 %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
862
863 ret;
864
865.align 8
866.Ldec_max32:
867 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
868 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
869 %xmm15, %rax, %rcx, 24);
870
871 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
872 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
873 %xmm15,
874 ((key_table + (24) * 8) + 8)(CTX),
875 ((key_table + (24) * 8) + 12)(CTX),
876 ((key_table + (24) * 8) + 0)(CTX),
877 ((key_table + (24) * 8) + 4)(CTX));
878
879 jmp .Ldec_max24;
880
881.align 8
882.global camellia_ecb_enc_16way
883.type camellia_ecb_enc_16way,@function;
884
885camellia_ecb_enc_16way:
886 /* input:
887 * %rdi: ctx, CTX
888 * %rsi: dst (16 blocks)
889 * %rdx: src (16 blocks)
890 */
891
892 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
893 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
894 %xmm15, %rdx, (key_table)(CTX));
895
896 /* now dst can be used as temporary buffer (even in src == dst case) */
897 movq %rsi, %rax;
898
899 call __camellia_enc_blk16;
900
901 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
902 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
903 %xmm8, %rsi);
904
905 ret;
906
907.align 8
908.global camellia_ecb_dec_16way
909.type camellia_ecb_dec_16way,@function;
910
911camellia_ecb_dec_16way:
912 /* input:
913 * %rdi: ctx, CTX
914 * %rsi: dst (16 blocks)
915 * %rdx: src (16 blocks)
916 */
917
918 cmpl $16, key_length(CTX);
919 movl $32, %r8d;
920 movl $24, %eax;
921 cmovel %eax, %r8d; /* max */
922
923 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
924 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
925 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
926
927 /* now dst can be used as temporary buffer (even in src == dst case) */
928 movq %rsi, %rax;
929
930 call __camellia_dec_blk16;
931
932 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
933 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
934 %xmm8, %rsi);
935
936 ret;
937
938.align 8
939.global camellia_cbc_dec_16way
940.type camellia_cbc_dec_16way,@function;
941
942camellia_cbc_dec_16way:
943 /* input:
944 * %rdi: ctx, CTX
945 * %rsi: dst (16 blocks)
946 * %rdx: src (16 blocks)
947 */
948
949 cmpl $16, key_length(CTX);
950 movl $32, %r8d;
951 movl $24, %eax;
952 cmovel %eax, %r8d; /* max */
953
954 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
955 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
956 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
957
958 /*
959 * dst might still be in-use (in case dst == src), so use stack for
960 * temporary storage.
961 */
962 subq $(16 * 16), %rsp;
963 movq %rsp, %rax;
964
965 call __camellia_dec_blk16;
966
967 addq $(16 * 16), %rsp;
968
969 vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
970 vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
971 vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
972 vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
973 vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
974 vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
975 vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
976 vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
977 vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
978 vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
979 vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
980 vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
981 vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
982 vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
983 vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
984 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
985 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
986 %xmm8, %rsi);
987
988 ret;
989
990#define inc_le128(x, minus_one, tmp) \
991 vpcmpeqq minus_one, x, tmp; \
992 vpsubq minus_one, x, x; \
993 vpslldq $8, tmp, tmp; \
994 vpsubq tmp, x, x;
995
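/*
 * 128-bit little-endian increment; 'minus_one' must hold -1 in the low
 * qword and 0 in the high qword (as set up below).  Scalar equivalent:
 *   lo += 1;
 *   if (lo == 0)
 *           hi += 1;
 * The carry is detected by comparing the pre-increment low qword
 * against all-ones with vpcmpeqq.
 */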
996.align 8
997.global camellia_ctr_16way
998.type camellia_ctr_16way,@function;
999
1000camellia_ctr_16way:
1001 /* input:
1002 * %rdi: ctx, CTX
1003 * %rsi: dst (16 blocks)
1004 * %rdx: src (16 blocks)
1005 * %rcx: iv (little endian, 128bit)
1006 */
1007
1008 subq $(16 * 16), %rsp;
1009 movq %rsp, %rax;
1010
1011 vmovdqa .Lbswap128_mask, %xmm14;
1012
1013 /* load IV and byteswap */
1014 vmovdqu (%rcx), %xmm0;
1015 vpshufb %xmm14, %xmm0, %xmm15;
1016 vmovdqu %xmm15, 15 * 16(%rax);
1017
1018 vpcmpeqd %xmm15, %xmm15, %xmm15;
1019 vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
1020
1021 /* construct IVs */
1022 inc_le128(%xmm0, %xmm15, %xmm13);
1023 vpshufb %xmm14, %xmm0, %xmm13;
1024 vmovdqu %xmm13, 14 * 16(%rax);
1025 inc_le128(%xmm0, %xmm15, %xmm13);
1026 vpshufb %xmm14, %xmm0, %xmm13;
1027 vmovdqu %xmm13, 13 * 16(%rax);
1028 inc_le128(%xmm0, %xmm15, %xmm13);
1029 vpshufb %xmm14, %xmm0, %xmm12;
1030 inc_le128(%xmm0, %xmm15, %xmm13);
1031 vpshufb %xmm14, %xmm0, %xmm11;
1032 inc_le128(%xmm0, %xmm15, %xmm13);
1033 vpshufb %xmm14, %xmm0, %xmm10;
1034 inc_le128(%xmm0, %xmm15, %xmm13);
1035 vpshufb %xmm14, %xmm0, %xmm9;
1036 inc_le128(%xmm0, %xmm15, %xmm13);
1037 vpshufb %xmm14, %xmm0, %xmm8;
1038 inc_le128(%xmm0, %xmm15, %xmm13);
1039 vpshufb %xmm14, %xmm0, %xmm7;
1040 inc_le128(%xmm0, %xmm15, %xmm13);
1041 vpshufb %xmm14, %xmm0, %xmm6;
1042 inc_le128(%xmm0, %xmm15, %xmm13);
1043 vpshufb %xmm14, %xmm0, %xmm5;
1044 inc_le128(%xmm0, %xmm15, %xmm13);
1045 vpshufb %xmm14, %xmm0, %xmm4;
1046 inc_le128(%xmm0, %xmm15, %xmm13);
1047 vpshufb %xmm14, %xmm0, %xmm3;
1048 inc_le128(%xmm0, %xmm15, %xmm13);
1049 vpshufb %xmm14, %xmm0, %xmm2;
1050 inc_le128(%xmm0, %xmm15, %xmm13);
1051 vpshufb %xmm14, %xmm0, %xmm1;
1052 inc_le128(%xmm0, %xmm15, %xmm13);
1053 vmovdqa %xmm0, %xmm13;
1054 vpshufb %xmm14, %xmm0, %xmm0;
1055 inc_le128(%xmm13, %xmm15, %xmm14);
1056 vmovdqu %xmm13, (%rcx);
1057
1058 /* inpack16_pre: */
1059 vmovq (key_table)(CTX), %xmm15;
1060 vpshufb .Lpack_bswap, %xmm15, %xmm15;
1061 vpxor %xmm0, %xmm15, %xmm0;
1062 vpxor %xmm1, %xmm15, %xmm1;
1063 vpxor %xmm2, %xmm15, %xmm2;
1064 vpxor %xmm3, %xmm15, %xmm3;
1065 vpxor %xmm4, %xmm15, %xmm4;
1066 vpxor %xmm5, %xmm15, %xmm5;
1067 vpxor %xmm6, %xmm15, %xmm6;
1068 vpxor %xmm7, %xmm15, %xmm7;
1069 vpxor %xmm8, %xmm15, %xmm8;
1070 vpxor %xmm9, %xmm15, %xmm9;
1071 vpxor %xmm10, %xmm15, %xmm10;
1072 vpxor %xmm11, %xmm15, %xmm11;
1073 vpxor %xmm12, %xmm15, %xmm12;
1074 vpxor 13 * 16(%rax), %xmm15, %xmm13;
1075 vpxor 14 * 16(%rax), %xmm15, %xmm14;
1076 vpxor 15 * 16(%rax), %xmm15, %xmm15;
1077
1078 call __camellia_enc_blk16;
1079
1080 addq $(16 * 16), %rsp;
1081
1082 vpxor 0 * 16(%rdx), %xmm7, %xmm7;
1083 vpxor 1 * 16(%rdx), %xmm6, %xmm6;
1084 vpxor 2 * 16(%rdx), %xmm5, %xmm5;
1085 vpxor 3 * 16(%rdx), %xmm4, %xmm4;
1086 vpxor 4 * 16(%rdx), %xmm3, %xmm3;
1087 vpxor 5 * 16(%rdx), %xmm2, %xmm2;
1088 vpxor 6 * 16(%rdx), %xmm1, %xmm1;
1089 vpxor 7 * 16(%rdx), %xmm0, %xmm0;
1090 vpxor 8 * 16(%rdx), %xmm15, %xmm15;
1091 vpxor 9 * 16(%rdx), %xmm14, %xmm14;
1092 vpxor 10 * 16(%rdx), %xmm13, %xmm13;
1093 vpxor 11 * 16(%rdx), %xmm12, %xmm12;
1094 vpxor 12 * 16(%rdx), %xmm11, %xmm11;
1095 vpxor 13 * 16(%rdx), %xmm10, %xmm10;
1096 vpxor 14 * 16(%rdx), %xmm9, %xmm9;
1097 vpxor 15 * 16(%rdx), %xmm8, %xmm8;
1098 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1099 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1100 %xmm8, %rsi);
1101
1102 ret;
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 000000000000..96cbb6068fce
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,558 @@
1/*
2 * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/crypto.h>
16#include <linux/err.h>
17#include <crypto/algapi.h>
18#include <crypto/ctr.h>
19#include <crypto/lrw.h>
20#include <crypto/xts.h>
21#include <asm/xcr.h>
22#include <asm/xsave.h>
23#include <asm/crypto/camellia.h>
24#include <asm/crypto/ablk_helper.h>
25#include <asm/crypto/glue_helper.h>
26
27#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
28
29/* 16-way AES-NI parallel cipher functions */
30asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
31 const u8 *src);
32asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
33 const u8 *src);
34
35asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
36 const u8 *src);
37asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
38 const u8 *src, le128 *iv);
39
40static const struct common_glue_ctx camellia_enc = {
41 .num_funcs = 3,
42 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
43
44 .funcs = { {
45 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
46 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
47 }, {
48 .num_blocks = 2,
49 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
50 }, {
51 .num_blocks = 1,
52 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
53 } }
54};
55
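/*
 * The glue_helper framework walks these tables top-down: whole 16-block
 * chunks go to the AES-NI/AVX functions, and any remainder falls through
 * to the 2-way and then 1-way camellia-x86_64 routines.
 */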
56static const struct common_glue_ctx camellia_ctr = {
57 .num_funcs = 3,
58 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
59
60 .funcs = { {
61 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
62 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
63 }, {
64 .num_blocks = 2,
65 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
66 }, {
67 .num_blocks = 1,
68 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
69 } }
70};
71
72static const struct common_glue_ctx camellia_dec = {
73 .num_funcs = 3,
74 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
75
76 .funcs = { {
77 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
78 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
79 }, {
80 .num_blocks = 2,
81 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
82 }, {
83 .num_blocks = 1,
84 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
85 } }
86};
87
88static const struct common_glue_ctx camellia_dec_cbc = {
89 .num_funcs = 3,
90 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
91
92 .funcs = { {
93 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
94 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
95 }, {
96 .num_blocks = 2,
97 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
98 }, {
99 .num_blocks = 1,
100 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
101 } }
102};
103
104static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
105 struct scatterlist *src, unsigned int nbytes)
106{
107 return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
108}
109
110static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
111 struct scatterlist *src, unsigned int nbytes)
112{
113 return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
114}
115
116static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
117 struct scatterlist *src, unsigned int nbytes)
118{
119 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
120 dst, src, nbytes);
121}
122
123static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
124 struct scatterlist *src, unsigned int nbytes)
125{
126 return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
127 nbytes);
128}
129
130static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
134}
135
136static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
137{
138 return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
139 CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
140 nbytes);
141}
142
143static inline void camellia_fpu_end(bool fpu_enabled)
144{
145 glue_fpu_end(fpu_enabled);
146}
147
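/*
 * The byte-sliced code clobbers all 16 XMM registers, so it may only run
 * between kernel_fpu_begin() and kernel_fpu_end().  glue_fpu_begin()
 * enables the FPU only once at least CAMELLIA_AESNI_PARALLEL_BLOCKS of
 * data is pending, so short requests take the non-SIMD 2-way/1-way path
 * without paying the FPU save/restore cost.
 */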
148static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
149 unsigned int key_len)
150{
151 return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
152 &tfm->crt_flags);
153}
154
155struct crypt_priv {
156 struct camellia_ctx *ctx;
157 bool fpu_enabled;
158};
159
160static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
161{
162 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
163 struct crypt_priv *ctx = priv;
164 int i;
165
166 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
167
168 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
169 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
170 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
171 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
172 }
173
174 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
175 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
176 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
177 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
178 }
179
180 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
181 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
182}
183
184static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
185{
186 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
187 struct crypt_priv *ctx = priv;
188 int i;
189
190 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
191
192 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
193 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
194 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
195 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
196 }
197
198 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
199 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
200 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
201 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
202 }
203
204 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
205 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
206}
207
208static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
209 struct scatterlist *src, unsigned int nbytes)
210{
211 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
212 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
213 struct crypt_priv crypt_ctx = {
214 .ctx = &ctx->camellia_ctx,
215 .fpu_enabled = false,
216 };
217 struct lrw_crypt_req req = {
218 .tbuf = buf,
219 .tbuflen = sizeof(buf),
220
221 .table_ctx = &ctx->lrw_table,
222 .crypt_ctx = &crypt_ctx,
223 .crypt_fn = encrypt_callback,
224 };
225 int ret;
226
227 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
228 ret = lrw_crypt(desc, dst, src, nbytes, &req);
229 camellia_fpu_end(crypt_ctx.fpu_enabled);
230
231 return ret;
232}
233
234static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
235 struct scatterlist *src, unsigned int nbytes)
236{
237 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
238 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
239 struct crypt_priv crypt_ctx = {
240 .ctx = &ctx->camellia_ctx,
241 .fpu_enabled = false,
242 };
243 struct lrw_crypt_req req = {
244 .tbuf = buf,
245 .tbuflen = sizeof(buf),
246
247 .table_ctx = &ctx->lrw_table,
248 .crypt_ctx = &crypt_ctx,
249 .crypt_fn = decrypt_callback,
250 };
251 int ret;
252
253 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
254 ret = lrw_crypt(desc, dst, src, nbytes, &req);
255 camellia_fpu_end(crypt_ctx.fpu_enabled);
256
257 return ret;
258}
259
260static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
261 struct scatterlist *src, unsigned int nbytes)
262{
263 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
264 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
265 struct crypt_priv crypt_ctx = {
266 .ctx = &ctx->crypt_ctx,
267 .fpu_enabled = false,
268 };
269 struct xts_crypt_req req = {
270 .tbuf = buf,
271 .tbuflen = sizeof(buf),
272
273 .tweak_ctx = &ctx->tweak_ctx,
274 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
275 .crypt_ctx = &crypt_ctx,
276 .crypt_fn = encrypt_callback,
277 };
278 int ret;
279
280 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
281 ret = xts_crypt(desc, dst, src, nbytes, &req);
282 camellia_fpu_end(crypt_ctx.fpu_enabled);
283
284 return ret;
285}
286
287static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
288 struct scatterlist *src, unsigned int nbytes)
289{
290 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
291 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
292 struct crypt_priv crypt_ctx = {
293 .ctx = &ctx->crypt_ctx,
294 .fpu_enabled = false,
295 };
296 struct xts_crypt_req req = {
297 .tbuf = buf,
298 .tbuflen = sizeof(buf),
299
300 .tweak_ctx = &ctx->tweak_ctx,
301 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
302 .crypt_ctx = &crypt_ctx,
303 .crypt_fn = decrypt_callback,
304 };
305 int ret;
306
307 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
308 ret = xts_crypt(desc, dst, src, nbytes, &req);
309 camellia_fpu_end(crypt_ctx.fpu_enabled);
310
311 return ret;
312}
313
314static struct crypto_alg cmll_algs[10] = { {
315 .cra_name = "__ecb-camellia-aesni",
316 .cra_driver_name = "__driver-ecb-camellia-aesni",
317 .cra_priority = 0,
318 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
319 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
320 .cra_ctxsize = sizeof(struct camellia_ctx),
321 .cra_alignmask = 0,
322 .cra_type = &crypto_blkcipher_type,
323 .cra_module = THIS_MODULE,
324 .cra_u = {
325 .blkcipher = {
326 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
327 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
328 .setkey = camellia_setkey,
329 .encrypt = ecb_encrypt,
330 .decrypt = ecb_decrypt,
331 },
332 },
333}, {
334 .cra_name = "__cbc-camellia-aesni",
335 .cra_driver_name = "__driver-cbc-camellia-aesni",
336 .cra_priority = 0,
337 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
338 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
339 .cra_ctxsize = sizeof(struct camellia_ctx),
340 .cra_alignmask = 0,
341 .cra_type = &crypto_blkcipher_type,
342 .cra_module = THIS_MODULE,
343 .cra_u = {
344 .blkcipher = {
345 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
346 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
347 .setkey = camellia_setkey,
348 .encrypt = cbc_encrypt,
349 .decrypt = cbc_decrypt,
350 },
351 },
352}, {
353 .cra_name = "__ctr-camellia-aesni",
354 .cra_driver_name = "__driver-ctr-camellia-aesni",
355 .cra_priority = 0,
356 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
357 .cra_blocksize = 1,
358 .cra_ctxsize = sizeof(struct camellia_ctx),
359 .cra_alignmask = 0,
360 .cra_type = &crypto_blkcipher_type,
361 .cra_module = THIS_MODULE,
362 .cra_u = {
363 .blkcipher = {
364 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
365 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
366 .ivsize = CAMELLIA_BLOCK_SIZE,
367 .setkey = camellia_setkey,
368 .encrypt = ctr_crypt,
369 .decrypt = ctr_crypt,
370 },
371 },
372}, {
373 .cra_name = "__lrw-camellia-aesni",
374 .cra_driver_name = "__driver-lrw-camellia-aesni",
375 .cra_priority = 0,
376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
377 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
378 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
379 .cra_alignmask = 0,
380 .cra_type = &crypto_blkcipher_type,
381 .cra_module = THIS_MODULE,
382 .cra_exit = lrw_camellia_exit_tfm,
383 .cra_u = {
384 .blkcipher = {
385 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
386 CAMELLIA_BLOCK_SIZE,
387 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
388 CAMELLIA_BLOCK_SIZE,
389 .ivsize = CAMELLIA_BLOCK_SIZE,
390 .setkey = lrw_camellia_setkey,
391 .encrypt = lrw_encrypt,
392 .decrypt = lrw_decrypt,
393 },
394 },
395}, {
396 .cra_name = "__xts-camellia-aesni",
397 .cra_driver_name = "__driver-xts-camellia-aesni",
398 .cra_priority = 0,
399 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
400 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
401 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
402 .cra_alignmask = 0,
403 .cra_type = &crypto_blkcipher_type,
404 .cra_module = THIS_MODULE,
405 .cra_u = {
406 .blkcipher = {
407 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
408 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
409 .ivsize = CAMELLIA_BLOCK_SIZE,
410 .setkey = xts_camellia_setkey,
411 .encrypt = xts_encrypt,
412 .decrypt = xts_decrypt,
413 },
414 },
415}, {
416 .cra_name = "ecb(camellia)",
417 .cra_driver_name = "ecb-camellia-aesni",
418 .cra_priority = 400,
419 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
420 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
421 .cra_ctxsize = sizeof(struct async_helper_ctx),
422 .cra_alignmask = 0,
423 .cra_type = &crypto_ablkcipher_type,
424 .cra_module = THIS_MODULE,
425 .cra_init = ablk_init,
426 .cra_exit = ablk_exit,
427 .cra_u = {
428 .ablkcipher = {
429 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
430 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
431 .setkey = ablk_set_key,
432 .encrypt = ablk_encrypt,
433 .decrypt = ablk_decrypt,
434 },
435 },
436}, {
437 .cra_name = "cbc(camellia)",
438 .cra_driver_name = "cbc-camellia-aesni",
439 .cra_priority = 400,
440 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
441 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
442 .cra_ctxsize = sizeof(struct async_helper_ctx),
443 .cra_alignmask = 0,
444 .cra_type = &crypto_ablkcipher_type,
445 .cra_module = THIS_MODULE,
446 .cra_init = ablk_init,
447 .cra_exit = ablk_exit,
448 .cra_u = {
449 .ablkcipher = {
450 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
451 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
452 .ivsize = CAMELLIA_BLOCK_SIZE,
453 .setkey = ablk_set_key,
454 .encrypt = __ablk_encrypt,
455 .decrypt = ablk_decrypt,
456 },
457 },
458}, {
459 .cra_name = "ctr(camellia)",
460 .cra_driver_name = "ctr-camellia-aesni",
461 .cra_priority = 400,
462 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
463 .cra_blocksize = 1,
464 .cra_ctxsize = sizeof(struct async_helper_ctx),
465 .cra_alignmask = 0,
466 .cra_type = &crypto_ablkcipher_type,
467 .cra_module = THIS_MODULE,
468 .cra_init = ablk_init,
469 .cra_exit = ablk_exit,
470 .cra_u = {
471 .ablkcipher = {
472 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
473 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
474 .ivsize = CAMELLIA_BLOCK_SIZE,
475 .setkey = ablk_set_key,
476 .encrypt = ablk_encrypt,
477 .decrypt = ablk_encrypt,
478 .geniv = "chainiv",
479 },
480 },
481}, {
482 .cra_name = "lrw(camellia)",
483 .cra_driver_name = "lrw-camellia-aesni",
484 .cra_priority = 400,
485 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
486 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
487 .cra_ctxsize = sizeof(struct async_helper_ctx),
488 .cra_alignmask = 0,
489 .cra_type = &crypto_ablkcipher_type,
490 .cra_module = THIS_MODULE,
491 .cra_init = ablk_init,
492 .cra_exit = ablk_exit,
493 .cra_u = {
494 .ablkcipher = {
495 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
496 CAMELLIA_BLOCK_SIZE,
497 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
498 CAMELLIA_BLOCK_SIZE,
499 .ivsize = CAMELLIA_BLOCK_SIZE,
500 .setkey = ablk_set_key,
501 .encrypt = ablk_encrypt,
502 .decrypt = ablk_decrypt,
503 },
504 },
505}, {
506 .cra_name = "xts(camellia)",
507 .cra_driver_name = "xts-camellia-aesni",
508 .cra_priority = 400,
509 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
510 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
511 .cra_ctxsize = sizeof(struct async_helper_ctx),
512 .cra_alignmask = 0,
513 .cra_type = &crypto_ablkcipher_type,
514 .cra_module = THIS_MODULE,
515 .cra_init = ablk_init,
516 .cra_exit = ablk_exit,
517 .cra_u = {
518 .ablkcipher = {
519 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
520 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
521 .ivsize = CAMELLIA_BLOCK_SIZE,
522 .setkey = ablk_set_key,
523 .encrypt = ablk_encrypt,
524 .decrypt = ablk_decrypt,
525 },
526 },
527} };
528
529static int __init camellia_aesni_init(void)
530{
531 u64 xcr0;
532
533 if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
534 pr_info("AVX or AES-NI instructions are not detected.\n");
535 return -ENODEV;
536 }
537
538 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
539 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
540 pr_info("AVX detected but unusable.\n");
541 return -ENODEV;
542 }
543
544 return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
545}
546
547static void __exit camellia_aesni_fini(void)
548{
549 crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
550}
551
552module_init(camellia_aesni_init);
553module_exit(camellia_aesni_fini);
554
555MODULE_LICENSE("GPL");
556MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
557MODULE_ALIAS("camellia");
558MODULE_ALIAS("camellia-asm");
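Usage note (illustrative sketch, not part of the patch): once the module
is loaded, the driver is selected through the normal crypto API by
algorithm name; its priority of 400 makes it win over the generic and
plain x86_64 Camellia implementations. A minimal kernel-side fragment,
with the function name and surrounding setup hypothetical:

	#include <linux/crypto.h>
	#include <linux/err.h>

	static int camellia_ctr_example(const u8 *key, unsigned int keylen)
	{
		struct crypto_ablkcipher *tfm;
		int err;

		/* resolves to ctr-camellia-aesni when this driver is usable */
		tfm = crypto_alloc_ablkcipher("ctr(camellia)", 0, 0);
		if (IS_ERR(tfm))
			return PTR_ERR(tfm);

		err = crypto_ablkcipher_setkey(tfm, key, keylen);

		/* ... set up an ablkcipher_request and IV, then call
		 * crypto_ablkcipher_encrypt() ... */

		crypto_free_ablkcipher(tfm);
		return err;
	}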