author      Jussi Kivilinna <jussi.kivilinna@iki.fi>   2013-04-13 06:47:00 -0400
committer   Herbert Xu <herbert@gondor.apana.org.au>   2013-04-25 09:09:07 -0400
commit      f3f935a76aa0eee68da2b273a08d84ba8ffc7a73 (patch)
tree        c33db3ca826852d1f5b66ec48e08a08b0e273b78
parent      56d76c96a9f3e39ab733c5643b3ce5a1d4be242a (diff)
crypto: camellia - add AVX2/AES-NI/x86_64 assembler implementation of camellia cipher
Patch adds an AVX2/AES-NI/x86_64 implementation of the Camellia cipher,
requiring 32 parallel blocks of input (512 bytes). Compared to the AVX
implementation, this version is extended to use the 256-bit wide YMM
registers. Since the AES-NI instructions do not operate on YMM registers,
the data is split into two 128-bit registers and merged afterwards. Even
with this additional handling, performance should be higher than that of
the AES-NI/AVX implementation.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
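
The split/merge handling described above corresponds to the vaesenclast256
macro in the assembly below. As a rough illustration of the same pattern in
AVX2/AES-NI intrinsics (a sketch for exposition only; the kernel code is
hand-written assembly, and the function name here is made up):

#include <immintrin.h>  /* compile with -mavx2 -maes */

/*
 * Apply AESENCLAST across a 256-bit value by splitting it into two
 * 128-bit halves, since the AES-NI instructions only accept xmm
 * operands, then merging the halves back into one ymm value.
 */
static inline __m256i aesenclast_ymm(__m256i state, __m128i round_key)
{
	__m128i lo = _mm256_castsi256_si128(state);      /* low lane  */
	__m128i hi = _mm256_extracti128_si256(state, 1); /* high lane */

	lo = _mm_aesenclast_si128(lo, round_key);
	hi = _mm_aesenclast_si128(hi, round_key);

	return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}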
-rw-r--r--  arch/x86/crypto/Makefile                      |    2
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx2-asm_64.S  | 1368
-rw-r--r--  arch/x86/crypto/camellia_aesni_avx2_glue.c    |  586
-rw-r--r--  arch/x86/crypto/camellia_aesni_avx_glue.c     |   17
-rw-r--r--  arch/x86/include/asm/crypto/camellia.h        |   19
-rw-r--r--  crypto/Kconfig                                |   23
-rw-r--r--  crypto/testmgr.c                              |   12
7 files changed, 2024 insertions(+), 3 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a21af593ab8d..a3a0ed80f17c 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -43,6 +43,7 @@ endif
 # These modules require assembler to support AVX2.
 ifeq ($(avx2_supported),yes)
 obj-$(CONFIG_CRYPTO_BLOWFISH_AVX2_X86_64) += blowfish-avx2.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64) += camellia-aesni-avx2.o
 obj-$(CONFIG_CRYPTO_SERPENT_AVX2_X86_64) += serpent-avx2.o
 obj-$(CONFIG_CRYPTO_TWOFISH_AVX2_X86_64) += twofish-avx2.o
 endif
@@ -73,6 +74,7 @@ endif
 
 ifeq ($(avx2_supported),yes)
 blowfish-avx2-y := blowfish-avx2-asm_64.o blowfish_avx2_glue.o
+camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
 serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
 twofish-avx2-y := twofish-avx2-asm_64.o twofish_avx2_glue.o
 endif
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
new file mode 100644
index 000000000000..91a1878fcc3e
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -0,0 +1,1368 @@
1 | /* | ||
2 | * x86_64/AVX2/AES-NI assembler implementation of Camellia | ||
3 | * | ||
4 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/linkage.h> | ||
14 | |||
15 | #define CAMELLIA_TABLE_BYTE_LEN 272 | ||
16 | |||
17 | /* struct camellia_ctx: */ | ||
18 | #define key_table 0 | ||
19 | #define key_length CAMELLIA_TABLE_BYTE_LEN | ||
20 | |||
21 | /* register macros */ | ||
22 | #define CTX %rdi | ||
23 | #define RIO %r8 | ||
24 | |||
25 | /********************************************************************** | ||
26 | helper macros | ||
27 | **********************************************************************/ | ||
28 | #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ | ||
29 | vpand x, mask4bit, tmp0; \ | ||
30 | vpandn x, mask4bit, x; \ | ||
31 | vpsrld $4, x, x; \ | ||
32 | \ | ||
33 | vpshufb tmp0, lo_t, tmp0; \ | ||
34 | vpshufb x, hi_t, x; \ | ||
35 | vpxor tmp0, x, x; | ||
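The filter_8bit macro above evaluates an 8-bit table lookup as two 16-entry
lookups, one per nibble, combined with XOR; vpshufb performs the 16-entry
lookup on all 32 bytes of a ymm register at once. A scalar model of what it
computes per byte (a sketch for clarity, not kernel code):

/*
 * Per-byte model of filter_8bit: split x into nibbles, look each
 * nibble up in its 16-entry table, and XOR the results. This works
 * whenever the 8-bit transform is affine, as the Camellia pre- and
 * post-filters are.
 */
static unsigned char filter_8bit_model(unsigned char x,
				       const unsigned char lo_t[16],
				       const unsigned char hi_t[16])
{
	return lo_t[x & 0x0f] ^ hi_t[x >> 4];
}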
36 | |||
37 | #define ymm0_x xmm0 | ||
38 | #define ymm1_x xmm1 | ||
39 | #define ymm2_x xmm2 | ||
40 | #define ymm3_x xmm3 | ||
41 | #define ymm4_x xmm4 | ||
42 | #define ymm5_x xmm5 | ||
43 | #define ymm6_x xmm6 | ||
44 | #define ymm7_x xmm7 | ||
45 | #define ymm8_x xmm8 | ||
46 | #define ymm9_x xmm9 | ||
47 | #define ymm10_x xmm10 | ||
48 | #define ymm11_x xmm11 | ||
49 | #define ymm12_x xmm12 | ||
50 | #define ymm13_x xmm13 | ||
51 | #define ymm14_x xmm14 | ||
52 | #define ymm15_x xmm15 | ||
53 | |||
54 | /* | ||
55 | * AES-NI instructions do not operate on ymmX registers, so each ymm is | ||
56 | * split into two xmm halves and the results are merged afterwards. | ||
57 | */ | ||
58 | #define vaesenclast256(zero, yreg, tmp) \ | ||
59 | vextracti128 $1, yreg, tmp##_x; \ | ||
60 | vaesenclast zero##_x, yreg##_x, yreg##_x; \ | ||
61 | vaesenclast zero##_x, tmp##_x, tmp##_x; \ | ||
62 | vinserti128 $1, tmp##_x, yreg, yreg; | ||
63 | |||
64 | /********************************************************************** | ||
65 | 32-way camellia | ||
66 | **********************************************************************/ | ||
67 | |||
68 | /* | ||
69 | * IN: | ||
70 | * x0..x7: byte-sliced AB state | ||
71 | * mem_cd: register pointer storing CD state | ||
72 | * key: index for key material | ||
73 | * OUT: | ||
74 | * x0..x7: new byte-sliced CD state | ||
75 | */ | ||
76 | #define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \ | ||
77 | t7, mem_cd, key) \ | ||
78 | /* \ | ||
79 | * S-function with AES subbytes \ | ||
80 | */ \ | ||
81 | vbroadcasti128 .Linv_shift_row, t4; \ | ||
82 | vpbroadcastb .L0f0f0f0f, t7; \ | ||
83 | vbroadcasti128 .Lpre_tf_lo_s1, t0; \ | ||
84 | vbroadcasti128 .Lpre_tf_hi_s1, t1; \ | ||
85 | \ | ||
86 | /* AES inverse shift rows */ \ | ||
87 | vpshufb t4, x0, x0; \ | ||
88 | vpshufb t4, x7, x7; \ | ||
89 | vpshufb t4, x1, x1; \ | ||
90 | vpshufb t4, x4, x4; \ | ||
91 | vpshufb t4, x2, x2; \ | ||
92 | vpshufb t4, x5, x5; \ | ||
93 | vpshufb t4, x3, x3; \ | ||
94 | vpshufb t4, x6, x6; \ | ||
95 | \ | ||
96 | /* prefilter sboxes 1, 2 and 3 */ \ | ||
97 | vbroadcasti128 .Lpre_tf_lo_s4, t2; \ | ||
98 | vbroadcasti128 .Lpre_tf_hi_s4, t3; \ | ||
99 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
100 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
101 | filter_8bit(x1, t0, t1, t7, t6); \ | ||
102 | filter_8bit(x4, t0, t1, t7, t6); \ | ||
103 | filter_8bit(x2, t0, t1, t7, t6); \ | ||
104 | filter_8bit(x5, t0, t1, t7, t6); \ | ||
105 | \ | ||
106 | /* prefilter sbox 4 */ \ | ||
107 | vpxor t4##_x, t4##_x, t4##_x; \ | ||
108 | filter_8bit(x3, t2, t3, t7, t6); \ | ||
109 | filter_8bit(x6, t2, t3, t7, t6); \ | ||
110 | \ | ||
111 | /* AES subbytes + AES shift rows */ \ | ||
112 | vbroadcasti128 .Lpost_tf_lo_s1, t0; \ | ||
113 | vbroadcasti128 .Lpost_tf_hi_s1, t1; \ | ||
114 | vaesenclast256(t4, x0, t5); \ | ||
115 | vaesenclast256(t4, x7, t5); \ | ||
116 | vaesenclast256(t4, x1, t5); \ | ||
117 | vaesenclast256(t4, x4, t5); \ | ||
118 | vaesenclast256(t4, x2, t5); \ | ||
119 | vaesenclast256(t4, x5, t5); \ | ||
120 | vaesenclast256(t4, x3, t5); \ | ||
121 | vaesenclast256(t4, x6, t5); \ | ||
122 | \ | ||
123 | /* postfilter sboxes 1 and 4 */ \ | ||
124 | vbroadcasti128 .Lpost_tf_lo_s3, t2; \ | ||
125 | vbroadcasti128 .Lpost_tf_hi_s3, t3; \ | ||
126 | filter_8bit(x0, t0, t1, t7, t6); \ | ||
127 | filter_8bit(x7, t0, t1, t7, t6); \ | ||
128 | filter_8bit(x3, t0, t1, t7, t6); \ | ||
129 | filter_8bit(x6, t0, t1, t7, t6); \ | ||
130 | \ | ||
131 | /* postfilter sbox 3 */ \ | ||
132 | vbroadcasti128 .Lpost_tf_lo_s2, t4; \ | ||
133 | vbroadcasti128 .Lpost_tf_hi_s2, t5; \ | ||
134 | filter_8bit(x2, t2, t3, t7, t6); \ | ||
135 | filter_8bit(x5, t2, t3, t7, t6); \ | ||
136 | \ | ||
137 | vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \ | ||
138 | \ | ||
139 | /* postfilter sbox 2 */ \ | ||
140 | filter_8bit(x1, t4, t5, t7, t2); \ | ||
141 | filter_8bit(x4, t4, t5, t7, t2); \ | ||
142 | \ | ||
143 | vpsrldq $1, t0, t1; \ | ||
144 | vpsrldq $2, t0, t2; \ | ||
145 | vpsrldq $3, t0, t3; \ | ||
146 | vpsrldq $4, t0, t4; \ | ||
147 | vpsrldq $5, t0, t5; \ | ||
148 | vpsrldq $6, t0, t6; \ | ||
149 | vpsrldq $7, t0, t7; \ | ||
150 | vpbroadcastb t0##_x, t0; \ | ||
151 | vpbroadcastb t1##_x, t1; \ | ||
152 | vpbroadcastb t2##_x, t2; \ | ||
153 | vpbroadcastb t3##_x, t3; \ | ||
154 | vpbroadcastb t4##_x, t4; \ | ||
155 | vpbroadcastb t6##_x, t6; \ | ||
156 | vpbroadcastb t5##_x, t5; \ | ||
157 | vpbroadcastb t7##_x, t7; \ | ||
158 | \ | ||
159 | /* P-function */ \ | ||
160 | vpxor x5, x0, x0; \ | ||
161 | vpxor x6, x1, x1; \ | ||
162 | vpxor x7, x2, x2; \ | ||
163 | vpxor x4, x3, x3; \ | ||
164 | \ | ||
165 | vpxor x2, x4, x4; \ | ||
166 | vpxor x3, x5, x5; \ | ||
167 | vpxor x0, x6, x6; \ | ||
168 | vpxor x1, x7, x7; \ | ||
169 | \ | ||
170 | vpxor x7, x0, x0; \ | ||
171 | vpxor x4, x1, x1; \ | ||
172 | vpxor x5, x2, x2; \ | ||
173 | vpxor x6, x3, x3; \ | ||
174 | \ | ||
175 | vpxor x3, x4, x4; \ | ||
176 | vpxor x0, x5, x5; \ | ||
177 | vpxor x1, x6, x6; \ | ||
178 | vpxor x2, x7, x7; /* note: high and low parts swapped */ \ | ||
179 | \ | ||
180 | /* Add key material and result to CD (x becomes new CD) */ \ | ||
181 | \ | ||
182 | vpxor t7, x0, x0; \ | ||
183 | vpxor 4 * 32(mem_cd), x0, x0; \ | ||
184 | \ | ||
185 | vpxor t6, x1, x1; \ | ||
186 | vpxor 5 * 32(mem_cd), x1, x1; \ | ||
187 | \ | ||
188 | vpxor t5, x2, x2; \ | ||
189 | vpxor 6 * 32(mem_cd), x2, x2; \ | ||
190 | \ | ||
191 | vpxor t4, x3, x3; \ | ||
192 | vpxor 7 * 32(mem_cd), x3, x3; \ | ||
193 | \ | ||
194 | vpxor t3, x4, x4; \ | ||
195 | vpxor 0 * 32(mem_cd), x4, x4; \ | ||
196 | \ | ||
197 | vpxor t2, x5, x5; \ | ||
198 | vpxor 1 * 32(mem_cd), x5, x5; \ | ||
199 | \ | ||
200 | vpxor t1, x6, x6; \ | ||
201 | vpxor 2 * 32(mem_cd), x6, x6; \ | ||
202 | \ | ||
203 | vpxor t0, x7, x7; \ | ||
204 | vpxor 3 * 32(mem_cd), x7, x7; | ||
205 | |||
206 | /* | ||
207 | * Size optimization... with roundsm32 inlined, the binary would be over | ||
208 | * 5 times larger and only marginally faster. | ||
209 | */ | ||
210 | .align 8 | ||
211 | roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd: | ||
212 | roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
213 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, | ||
214 | %rcx, (%r9)); | ||
215 | ret; | ||
216 | ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) | ||
217 | |||
218 | .align 8 | ||
219 | roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab: | ||
220 | roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, | ||
221 | %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, | ||
222 | %rax, (%r9)); | ||
223 | ret; | ||
224 | ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) | ||
225 | |||
226 | /* | ||
227 | * IN/OUT: | ||
228 | * x0..x7: byte-sliced AB state preloaded | ||
229 | * mem_ab: byte-sliced AB state in memory | ||
230 | * mem_cd: byte-sliced CD state in memory | ||
231 | */ | ||
232 | #define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
233 | y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ | ||
234 | leaq (key_table + (i) * 8)(CTX), %r9; \ | ||
235 | call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ | ||
236 | \ | ||
237 | vmovdqu x0, 4 * 32(mem_cd); \ | ||
238 | vmovdqu x1, 5 * 32(mem_cd); \ | ||
239 | vmovdqu x2, 6 * 32(mem_cd); \ | ||
240 | vmovdqu x3, 7 * 32(mem_cd); \ | ||
241 | vmovdqu x4, 0 * 32(mem_cd); \ | ||
242 | vmovdqu x5, 1 * 32(mem_cd); \ | ||
243 | vmovdqu x6, 2 * 32(mem_cd); \ | ||
244 | vmovdqu x7, 3 * 32(mem_cd); \ | ||
245 | \ | ||
246 | leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ | ||
247 | call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ | ||
248 | \ | ||
249 | store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); | ||
250 | |||
251 | #define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ | ||
252 | |||
253 | #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ | ||
254 | /* Store new AB state */ \ | ||
255 | vmovdqu x4, 4 * 32(mem_ab); \ | ||
256 | vmovdqu x5, 5 * 32(mem_ab); \ | ||
257 | vmovdqu x6, 6 * 32(mem_ab); \ | ||
258 | vmovdqu x7, 7 * 32(mem_ab); \ | ||
259 | vmovdqu x0, 0 * 32(mem_ab); \ | ||
260 | vmovdqu x1, 1 * 32(mem_ab); \ | ||
261 | vmovdqu x2, 2 * 32(mem_ab); \ | ||
262 | vmovdqu x3, 3 * 32(mem_ab); | ||
263 | |||
264 | #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
265 | y6, y7, mem_ab, mem_cd, i) \ | ||
266 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
267 | y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ | ||
268 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
269 | y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ | ||
270 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
271 | y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store); | ||
272 | |||
273 | #define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
274 | y6, y7, mem_ab, mem_cd, i) \ | ||
275 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
276 | y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \ | ||
277 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
278 | y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \ | ||
279 | two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
280 | y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store); | ||
281 | |||
282 | /* | ||
283 | * IN: | ||
284 | * v0..3: byte-sliced 32-bit integers | ||
285 | * OUT: | ||
286 | * v0..3: (IN <<< 1) | ||
287 | */ | ||
288 | #define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \ | ||
289 | vpcmpgtb v0, zero, t0; \ | ||
290 | vpaddb v0, v0, v0; \ | ||
291 | vpabsb t0, t0; \ | ||
292 | \ | ||
293 | vpcmpgtb v1, zero, t1; \ | ||
294 | vpaddb v1, v1, v1; \ | ||
295 | vpabsb t1, t1; \ | ||
296 | \ | ||
297 | vpcmpgtb v2, zero, t2; \ | ||
298 | vpaddb v2, v2, v2; \ | ||
299 | vpabsb t2, t2; \ | ||
300 | \ | ||
301 | vpor t0, v1, v1; \ | ||
302 | \ | ||
303 | vpcmpgtb v3, zero, t0; \ | ||
304 | vpaddb v3, v3, v3; \ | ||
305 | vpabsb t0, t0; \ | ||
306 | \ | ||
307 | vpor t1, v2, v2; \ | ||
308 | vpor t2, v3, v3; \ | ||
309 | vpor t0, v0, v0; | ||
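There is no byte-level shift-with-carry instruction, so rol32_1_32 builds
the rotate from parts: vpcmpgtb against zero yields 0xff for bytes with the
top bit set, vpabsb turns that into 0x01, and vpaddb doubles each byte (a
left shift by one). Per 32-bit value, the effect is an ordinary rotate-left
by one across the four byte slices. A scalar model (a sketch, assuming v0
holds the least-significant byte slice):

/*
 * Rotate a byte-sliced 32-bit value left by one bit: each byte
 * shifts left and its old MSB becomes the LSB of the next slice,
 * with the top slice's MSB wrapping around to slice 0.
 */
static void rol32_1_model(unsigned char v[4])
{
	unsigned char msb0 = v[0] >> 7, msb1 = v[1] >> 7;
	unsigned char msb2 = v[2] >> 7, msb3 = v[3] >> 7;

	v[0] = (unsigned char)(v[0] << 1) | msb3; /* wrap-around carry */
	v[1] = (unsigned char)(v[1] << 1) | msb0;
	v[2] = (unsigned char)(v[2] << 1) | msb1;
	v[3] = (unsigned char)(v[3] << 1) | msb2;
}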
310 | |||
311 | /* | ||
312 | * IN: | ||
313 | * r: byte-sliced AB state in memory | ||
314 | * l: byte-sliced CD state in memory | ||
315 | * OUT: | ||
316 | * x0..x7: new byte-sliced CD state | ||
317 | */ | ||
318 | #define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \ | ||
319 | tt1, tt2, tt3, kll, klr, krl, krr) \ | ||
320 | /* \ | ||
321 | * t0 = kll; \ | ||
322 | * t0 &= ll; \ | ||
323 | * lr ^= rol32(t0, 1); \ | ||
324 | */ \ | ||
325 | vpbroadcastd kll, t0; /* only lowest 32-bit used */ \ | ||
326 | vpxor tt0, tt0, tt0; \ | ||
327 | vpbroadcastb t0##_x, t3; \ | ||
328 | vpsrldq $1, t0, t0; \ | ||
329 | vpbroadcastb t0##_x, t2; \ | ||
330 | vpsrldq $1, t0, t0; \ | ||
331 | vpbroadcastb t0##_x, t1; \ | ||
332 | vpsrldq $1, t0, t0; \ | ||
333 | vpbroadcastb t0##_x, t0; \ | ||
334 | \ | ||
335 | vpand l0, t0, t0; \ | ||
336 | vpand l1, t1, t1; \ | ||
337 | vpand l2, t2, t2; \ | ||
338 | vpand l3, t3, t3; \ | ||
339 | \ | ||
340 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
341 | \ | ||
342 | vpxor l4, t0, l4; \ | ||
343 | vmovdqu l4, 4 * 32(l); \ | ||
344 | vpxor l5, t1, l5; \ | ||
345 | vmovdqu l5, 5 * 32(l); \ | ||
346 | vpxor l6, t2, l6; \ | ||
347 | vmovdqu l6, 6 * 32(l); \ | ||
348 | vpxor l7, t3, l7; \ | ||
349 | vmovdqu l7, 7 * 32(l); \ | ||
350 | \ | ||
351 | /* \ | ||
352 | * t2 = krr; \ | ||
353 | * t2 |= rr; \ | ||
354 | * rl ^= t2; \ | ||
355 | */ \ | ||
356 | \ | ||
357 | vpbroadcastd krr, t0; /* only lowest 32-bit used */ \ | ||
358 | vpbroadcastb t0##_x, t3; \ | ||
359 | vpsrldq $1, t0, t0; \ | ||
360 | vpbroadcastb t0##_x, t2; \ | ||
361 | vpsrldq $1, t0, t0; \ | ||
362 | vpbroadcastb t0##_x, t1; \ | ||
363 | vpsrldq $1, t0, t0; \ | ||
364 | vpbroadcastb t0##_x, t0; \ | ||
365 | \ | ||
366 | vpor 4 * 32(r), t0, t0; \ | ||
367 | vpor 5 * 32(r), t1, t1; \ | ||
368 | vpor 6 * 32(r), t2, t2; \ | ||
369 | vpor 7 * 32(r), t3, t3; \ | ||
370 | \ | ||
371 | vpxor 0 * 32(r), t0, t0; \ | ||
372 | vpxor 1 * 32(r), t1, t1; \ | ||
373 | vpxor 2 * 32(r), t2, t2; \ | ||
374 | vpxor 3 * 32(r), t3, t3; \ | ||
375 | vmovdqu t0, 0 * 32(r); \ | ||
376 | vmovdqu t1, 1 * 32(r); \ | ||
377 | vmovdqu t2, 2 * 32(r); \ | ||
378 | vmovdqu t3, 3 * 32(r); \ | ||
379 | \ | ||
380 | /* \ | ||
381 | * t2 = krl; \ | ||
382 | * t2 &= rl; \ | ||
383 | * rr ^= rol32(t2, 1); \ | ||
384 | */ \ | ||
385 | vpbroadcastd krl, t0; /* only lowest 32-bit used */ \ | ||
386 | vpbroadcastb t0##_x, t3; \ | ||
387 | vpsrldq $1, t0, t0; \ | ||
388 | vpbroadcastb t0##_x, t2; \ | ||
389 | vpsrldq $1, t0, t0; \ | ||
390 | vpbroadcastb t0##_x, t1; \ | ||
391 | vpsrldq $1, t0, t0; \ | ||
392 | vpbroadcastb t0##_x, t0; \ | ||
393 | \ | ||
394 | vpand 0 * 32(r), t0, t0; \ | ||
395 | vpand 1 * 32(r), t1, t1; \ | ||
396 | vpand 2 * 32(r), t2, t2; \ | ||
397 | vpand 3 * 32(r), t3, t3; \ | ||
398 | \ | ||
399 | rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \ | ||
400 | \ | ||
401 | vpxor 4 * 32(r), t0, t0; \ | ||
402 | vpxor 5 * 32(r), t1, t1; \ | ||
403 | vpxor 6 * 32(r), t2, t2; \ | ||
404 | vpxor 7 * 32(r), t3, t3; \ | ||
405 | vmovdqu t0, 4 * 32(r); \ | ||
406 | vmovdqu t1, 5 * 32(r); \ | ||
407 | vmovdqu t2, 6 * 32(r); \ | ||
408 | vmovdqu t3, 7 * 32(r); \ | ||
409 | \ | ||
410 | /* \ | ||
411 | * t0 = klr; \ | ||
412 | * t0 |= lr; \ | ||
413 | * ll ^= t0; \ | ||
414 | */ \ | ||
415 | \ | ||
416 | vpbroadcastd klr, t0; /* only lowest 32-bit used */ \ | ||
417 | vpbroadcastb t0##_x, t3; \ | ||
418 | vpsrldq $1, t0, t0; \ | ||
419 | vpbroadcastb t0##_x, t2; \ | ||
420 | vpsrldq $1, t0, t0; \ | ||
421 | vpbroadcastb t0##_x, t1; \ | ||
422 | vpsrldq $1, t0, t0; \ | ||
423 | vpbroadcastb t0##_x, t0; \ | ||
424 | \ | ||
425 | vpor l4, t0, t0; \ | ||
426 | vpor l5, t1, t1; \ | ||
427 | vpor l6, t2, t2; \ | ||
428 | vpor l7, t3, t3; \ | ||
429 | \ | ||
430 | vpxor l0, t0, l0; \ | ||
431 | vmovdqu l0, 0 * 32(l); \ | ||
432 | vpxor l1, t1, l1; \ | ||
433 | vmovdqu l1, 1 * 32(l); \ | ||
434 | vpxor l2, t2, l2; \ | ||
435 | vmovdqu l2, 2 * 32(l); \ | ||
436 | vpxor l3, t3, l3; \ | ||
437 | vmovdqu l3, 3 * 32(l); | ||
438 | |||
439 | #define transpose_4x4(x0, x1, x2, x3, t1, t2) \ | ||
440 | vpunpckhdq x1, x0, t2; \ | ||
441 | vpunpckldq x1, x0, x0; \ | ||
442 | \ | ||
443 | vpunpckldq x3, x2, t1; \ | ||
444 | vpunpckhdq x3, x2, x2; \ | ||
445 | \ | ||
446 | vpunpckhqdq t1, x0, x1; \ | ||
447 | vpunpcklqdq t1, x0, x0; \ | ||
448 | \ | ||
449 | vpunpckhqdq x2, t2, x3; \ | ||
450 | vpunpcklqdq x2, t2, x2; | ||
451 | |||
452 | #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \ | ||
453 | a3, b3, c3, d3, st0, st1) \ | ||
454 | vmovdqu d2, st0; \ | ||
455 | vmovdqu d3, st1; \ | ||
456 | transpose_4x4(a0, a1, a2, a3, d2, d3); \ | ||
457 | transpose_4x4(b0, b1, b2, b3, d2, d3); \ | ||
458 | vmovdqu st0, d2; \ | ||
459 | vmovdqu st1, d3; \ | ||
460 | \ | ||
461 | vmovdqu a0, st0; \ | ||
462 | vmovdqu a1, st1; \ | ||
463 | transpose_4x4(c0, c1, c2, c3, a0, a1); \ | ||
464 | transpose_4x4(d0, d1, d2, d3, a0, a1); \ | ||
465 | \ | ||
466 | vbroadcasti128 .Lshufb_16x16b, a0; \ | ||
467 | vmovdqu st1, a1; \ | ||
468 | vpshufb a0, a2, a2; \ | ||
469 | vpshufb a0, a3, a3; \ | ||
470 | vpshufb a0, b0, b0; \ | ||
471 | vpshufb a0, b1, b1; \ | ||
472 | vpshufb a0, b2, b2; \ | ||
473 | vpshufb a0, b3, b3; \ | ||
474 | vpshufb a0, a1, a1; \ | ||
475 | vpshufb a0, c0, c0; \ | ||
476 | vpshufb a0, c1, c1; \ | ||
477 | vpshufb a0, c2, c2; \ | ||
478 | vpshufb a0, c3, c3; \ | ||
479 | vpshufb a0, d0, d0; \ | ||
480 | vpshufb a0, d1, d1; \ | ||
481 | vpshufb a0, d2, d2; \ | ||
482 | vpshufb a0, d3, d3; \ | ||
483 | vmovdqu d3, st1; \ | ||
484 | vmovdqu st0, d3; \ | ||
485 | vpshufb a0, d3, a0; \ | ||
486 | vmovdqu d2, st0; \ | ||
487 | \ | ||
488 | transpose_4x4(a0, b0, c0, d0, d2, d3); \ | ||
489 | transpose_4x4(a1, b1, c1, d1, d2, d3); \ | ||
490 | vmovdqu st0, d2; \ | ||
491 | vmovdqu st1, d3; \ | ||
492 | \ | ||
493 | vmovdqu b0, st0; \ | ||
494 | vmovdqu b1, st1; \ | ||
495 | transpose_4x4(a2, b2, c2, d2, b0, b1); \ | ||
496 | transpose_4x4(a3, b3, c3, d3, b0, b1); \ | ||
497 | vmovdqu st0, b0; \ | ||
498 | vmovdqu st1, b1; \ | ||
499 | /* does not adjust output bytes inside vectors */ | ||
500 | |||
501 | /* load blocks to registers and apply pre-whitening */ | ||
502 | #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
503 | y6, y7, rio, key) \ | ||
504 | vpbroadcastq key, x0; \ | ||
505 | vpshufb .Lpack_bswap, x0, x0; \ | ||
506 | \ | ||
507 | vpxor 0 * 32(rio), x0, y7; \ | ||
508 | vpxor 1 * 32(rio), x0, y6; \ | ||
509 | vpxor 2 * 32(rio), x0, y5; \ | ||
510 | vpxor 3 * 32(rio), x0, y4; \ | ||
511 | vpxor 4 * 32(rio), x0, y3; \ | ||
512 | vpxor 5 * 32(rio), x0, y2; \ | ||
513 | vpxor 6 * 32(rio), x0, y1; \ | ||
514 | vpxor 7 * 32(rio), x0, y0; \ | ||
515 | vpxor 8 * 32(rio), x0, x7; \ | ||
516 | vpxor 9 * 32(rio), x0, x6; \ | ||
517 | vpxor 10 * 32(rio), x0, x5; \ | ||
518 | vpxor 11 * 32(rio), x0, x4; \ | ||
519 | vpxor 12 * 32(rio), x0, x3; \ | ||
520 | vpxor 13 * 32(rio), x0, x2; \ | ||
521 | vpxor 14 * 32(rio), x0, x1; \ | ||
522 | vpxor 15 * 32(rio), x0, x0; | ||
523 | |||
524 | /* byteslice pre-whitened blocks and store to temporary memory */ | ||
525 | #define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
526 | y6, y7, mem_ab, mem_cd) \ | ||
527 | byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \ | ||
528 | y4, y5, y6, y7, (mem_ab), (mem_cd)); \ | ||
529 | \ | ||
530 | vmovdqu x0, 0 * 32(mem_ab); \ | ||
531 | vmovdqu x1, 1 * 32(mem_ab); \ | ||
532 | vmovdqu x2, 2 * 32(mem_ab); \ | ||
533 | vmovdqu x3, 3 * 32(mem_ab); \ | ||
534 | vmovdqu x4, 4 * 32(mem_ab); \ | ||
535 | vmovdqu x5, 5 * 32(mem_ab); \ | ||
536 | vmovdqu x6, 6 * 32(mem_ab); \ | ||
537 | vmovdqu x7, 7 * 32(mem_ab); \ | ||
538 | vmovdqu y0, 0 * 32(mem_cd); \ | ||
539 | vmovdqu y1, 1 * 32(mem_cd); \ | ||
540 | vmovdqu y2, 2 * 32(mem_cd); \ | ||
541 | vmovdqu y3, 3 * 32(mem_cd); \ | ||
542 | vmovdqu y4, 4 * 32(mem_cd); \ | ||
543 | vmovdqu y5, 5 * 32(mem_cd); \ | ||
544 | vmovdqu y6, 6 * 32(mem_cd); \ | ||
545 | vmovdqu y7, 7 * 32(mem_cd); | ||
546 | |||
547 | /* de-byteslice, apply post-whitening and store blocks */ | ||
548 | #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \ | ||
549 | y5, y6, y7, key, stack_tmp0, stack_tmp1) \ | ||
550 | byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \ | ||
551 | y3, y7, x3, x7, stack_tmp0, stack_tmp1); \ | ||
552 | \ | ||
553 | vmovdqu x0, stack_tmp0; \ | ||
554 | \ | ||
555 | vpbroadcastq key, x0; \ | ||
556 | vpshufb .Lpack_bswap, x0, x0; \ | ||
557 | \ | ||
558 | vpxor x0, y7, y7; \ | ||
559 | vpxor x0, y6, y6; \ | ||
560 | vpxor x0, y5, y5; \ | ||
561 | vpxor x0, y4, y4; \ | ||
562 | vpxor x0, y3, y3; \ | ||
563 | vpxor x0, y2, y2; \ | ||
564 | vpxor x0, y1, y1; \ | ||
565 | vpxor x0, y0, y0; \ | ||
566 | vpxor x0, x7, x7; \ | ||
567 | vpxor x0, x6, x6; \ | ||
568 | vpxor x0, x5, x5; \ | ||
569 | vpxor x0, x4, x4; \ | ||
570 | vpxor x0, x3, x3; \ | ||
571 | vpxor x0, x2, x2; \ | ||
572 | vpxor x0, x1, x1; \ | ||
573 | vpxor stack_tmp0, x0, x0; | ||
574 | |||
575 | #define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ | ||
576 | y6, y7, rio) \ | ||
577 | vmovdqu x0, 0 * 32(rio); \ | ||
578 | vmovdqu x1, 1 * 32(rio); \ | ||
579 | vmovdqu x2, 2 * 32(rio); \ | ||
580 | vmovdqu x3, 3 * 32(rio); \ | ||
581 | vmovdqu x4, 4 * 32(rio); \ | ||
582 | vmovdqu x5, 5 * 32(rio); \ | ||
583 | vmovdqu x6, 6 * 32(rio); \ | ||
584 | vmovdqu x7, 7 * 32(rio); \ | ||
585 | vmovdqu y0, 8 * 32(rio); \ | ||
586 | vmovdqu y1, 9 * 32(rio); \ | ||
587 | vmovdqu y2, 10 * 32(rio); \ | ||
588 | vmovdqu y3, 11 * 32(rio); \ | ||
589 | vmovdqu y4, 12 * 32(rio); \ | ||
590 | vmovdqu y5, 13 * 32(rio); \ | ||
591 | vmovdqu y6, 14 * 32(rio); \ | ||
592 | vmovdqu y7, 15 * 32(rio); | ||
593 | |||
594 | .data | ||
595 | .align 32 | ||
596 | |||
597 | #define SHUFB_BYTES(idx) \ | ||
598 | 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) | ||
599 | |||
600 | .Lshufb_16x16b: | ||
601 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) | ||
602 | .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3) | ||
603 | |||
604 | .Lpack_bswap: | ||
605 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 | ||
606 | .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 | ||
607 | |||
608 | /* For CTR-mode IV byteswap */ | ||
609 | .Lbswap128_mask: | ||
610 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
611 | |||
612 | /* For XTS mode */ | ||
613 | .Lxts_gf128mul_and_shl1_mask_0: | ||
614 | .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | ||
615 | .Lxts_gf128mul_and_shl1_mask_1: | ||
616 | .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 | ||
617 | |||
618 | /* | ||
619 | * pre-SubByte transform | ||
620 | * | ||
621 | * pre-lookup for sbox1, sbox2, sbox3: | ||
622 | * swap_bitendianness( | ||
623 | * isom_map_camellia_to_aes( | ||
624 | * camellia_f( | ||
625 | * swap_bitendianness(in) | ||
626 | * ) | ||
627 | * ) | ||
628 | * ) | ||
629 | * | ||
630 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
631 | */ | ||
632 | .Lpre_tf_lo_s1: | ||
633 | .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86 | ||
634 | .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88 | ||
635 | .Lpre_tf_hi_s1: | ||
636 | .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a | ||
637 | .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23 | ||
638 | |||
639 | /* | ||
640 | * pre-SubByte transform | ||
641 | * | ||
642 | * pre-lookup for sbox4: | ||
643 | * swap_bitendianness( | ||
644 | * isom_map_camellia_to_aes( | ||
645 | * camellia_f( | ||
646 | * swap_bitendianness(in <<< 1) | ||
647 | * ) | ||
648 | * ) | ||
649 | * ) | ||
650 | * | ||
651 | * (note: '⊕ 0xc5' inside camellia_f()) | ||
652 | */ | ||
653 | .Lpre_tf_lo_s4: | ||
654 | .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25 | ||
655 | .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74 | ||
656 | .Lpre_tf_hi_s4: | ||
657 | .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72 | ||
658 | .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf | ||
659 | |||
660 | /* | ||
661 | * post-SubByte transform | ||
662 | * | ||
663 | * post-lookup for sbox1, sbox4: | ||
664 | * swap_bitendianness( | ||
665 | * camellia_h( | ||
666 | * isom_map_aes_to_camellia( | ||
667 | * swap_bitendianness( | ||
668 | * aes_inverse_affine_transform(in) | ||
669 | * ) | ||
670 | * ) | ||
671 | * ) | ||
672 | * ) | ||
673 | * | ||
674 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
675 | */ | ||
676 | .Lpost_tf_lo_s1: | ||
677 | .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31 | ||
678 | .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1 | ||
679 | .Lpost_tf_hi_s1: | ||
680 | .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8 | ||
681 | .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c | ||
682 | |||
683 | /* | ||
684 | * post-SubByte transform | ||
685 | * | ||
686 | * post-lookup for sbox2: | ||
687 | * swap_bitendianness( | ||
688 | * camellia_h( | ||
689 | * isom_map_aes_to_camellia( | ||
690 | * swap_bitendianness( | ||
691 | * aes_inverse_affine_transform(in) | ||
692 | * ) | ||
693 | * ) | ||
694 | * ) | ||
695 | * ) <<< 1 | ||
696 | * | ||
697 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
698 | */ | ||
699 | .Lpost_tf_lo_s2: | ||
700 | .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62 | ||
701 | .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3 | ||
702 | .Lpost_tf_hi_s2: | ||
703 | .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51 | ||
704 | .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18 | ||
705 | |||
706 | /* | ||
707 | * post-SubByte transform | ||
708 | * | ||
709 | * post-lookup for sbox3: | ||
710 | * swap_bitendianness( | ||
711 | * camellia_h( | ||
712 | * isom_map_aes_to_camellia( | ||
713 | * swap_bitendianness( | ||
714 | * aes_inverse_affine_transform(in) | ||
715 | * ) | ||
716 | * ) | ||
717 | * ) | ||
718 | * ) >>> 1 | ||
719 | * | ||
720 | * (note: '⊕ 0x6e' inside camellia_h()) | ||
721 | */ | ||
722 | .Lpost_tf_lo_s3: | ||
723 | .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98 | ||
724 | .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8 | ||
725 | .Lpost_tf_hi_s3: | ||
726 | .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54 | ||
727 | .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06 | ||
728 | |||
729 | /* For isolating SubBytes from AESENCLAST, inverse shift row */ | ||
730 | .Linv_shift_row: | ||
731 | .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b | ||
732 | .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 | ||
733 | |||
734 | .align 4 | ||
735 | /* 4-bit mask */ | ||
736 | .L0f0f0f0f: | ||
737 | .long 0x0f0f0f0f | ||
738 | |||
739 | .text | ||
740 | |||
741 | .align 8 | ||
742 | __camellia_enc_blk32: | ||
743 | /* input: | ||
744 | * %rdi: ctx, CTX | ||
745 | * %rax: temporary storage, 512 bytes | ||
746 | * %ymm0..%ymm15: 32 plaintext blocks | ||
747 | * output: | ||
748 | * %ymm0..%ymm15: 32 encrypted blocks, order swapped: | ||
749 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | ||
750 | */ | ||
751 | |||
752 | leaq 8 * 32(%rax), %rcx; | ||
753 | |||
754 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
755 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
756 | %ymm15, %rax, %rcx); | ||
757 | |||
758 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
759 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
760 | %ymm15, %rax, %rcx, 0); | ||
761 | |||
762 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
763 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
764 | %ymm15, | ||
765 | ((key_table + (8) * 8) + 0)(CTX), | ||
766 | ((key_table + (8) * 8) + 4)(CTX), | ||
767 | ((key_table + (8) * 8) + 8)(CTX), | ||
768 | ((key_table + (8) * 8) + 12)(CTX)); | ||
769 | |||
770 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
771 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
772 | %ymm15, %rax, %rcx, 8); | ||
773 | |||
774 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
775 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
776 | %ymm15, | ||
777 | ((key_table + (16) * 8) + 0)(CTX), | ||
778 | ((key_table + (16) * 8) + 4)(CTX), | ||
779 | ((key_table + (16) * 8) + 8)(CTX), | ||
780 | ((key_table + (16) * 8) + 12)(CTX)); | ||
781 | |||
782 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
783 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
784 | %ymm15, %rax, %rcx, 16); | ||
785 | |||
786 | movl $24, %r8d; | ||
787 | cmpl $16, key_length(CTX); | ||
788 | jne .Lenc_max32; | ||
789 | |||
790 | .Lenc_done: | ||
791 | /* load CD for output */ | ||
792 | vmovdqu 0 * 32(%rcx), %ymm8; | ||
793 | vmovdqu 1 * 32(%rcx), %ymm9; | ||
794 | vmovdqu 2 * 32(%rcx), %ymm10; | ||
795 | vmovdqu 3 * 32(%rcx), %ymm11; | ||
796 | vmovdqu 4 * 32(%rcx), %ymm12; | ||
797 | vmovdqu 5 * 32(%rcx), %ymm13; | ||
798 | vmovdqu 6 * 32(%rcx), %ymm14; | ||
799 | vmovdqu 7 * 32(%rcx), %ymm15; | ||
800 | |||
801 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
802 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
803 | %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax)); | ||
804 | |||
805 | ret; | ||
806 | |||
807 | .align 8 | ||
808 | .Lenc_max32: | ||
809 | movl $32, %r8d; | ||
810 | |||
811 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
812 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
813 | %ymm15, | ||
814 | ((key_table + (24) * 8) + 0)(CTX), | ||
815 | ((key_table + (24) * 8) + 4)(CTX), | ||
816 | ((key_table + (24) * 8) + 8)(CTX), | ||
817 | ((key_table + (24) * 8) + 12)(CTX)); | ||
818 | |||
819 | enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
820 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
821 | %ymm15, %rax, %rcx, 24); | ||
822 | |||
823 | jmp .Lenc_done; | ||
824 | ENDPROC(__camellia_enc_blk32) | ||
825 | |||
826 | .align 8 | ||
827 | __camellia_dec_blk32: | ||
828 | /* input: | ||
829 | * %rdi: ctx, CTX | ||
830 | * %rax: temporary storage, 512 bytes | ||
831 | * %r8d: 24 for 16-byte key, 32 for larger | ||
832 | * %ymm0..%ymm15: 32 encrypted blocks | ||
833 | * output: | ||
834 | * %ymm0..%ymm15: 32 plaintext blocks, order swapped: | ||
835 | * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 | ||
836 | */ | ||
837 | |||
838 | leaq 8 * 32(%rax), %rcx; | ||
839 | |||
840 | inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
841 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
842 | %ymm15, %rax, %rcx); | ||
843 | |||
844 | cmpl $32, %r8d; | ||
845 | je .Ldec_max32; | ||
846 | |||
847 | .Ldec_max24: | ||
848 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
849 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
850 | %ymm15, %rax, %rcx, 16); | ||
851 | |||
852 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
853 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
854 | %ymm15, | ||
855 | ((key_table + (16) * 8) + 8)(CTX), | ||
856 | ((key_table + (16) * 8) + 12)(CTX), | ||
857 | ((key_table + (16) * 8) + 0)(CTX), | ||
858 | ((key_table + (16) * 8) + 4)(CTX)); | ||
859 | |||
860 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
861 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
862 | %ymm15, %rax, %rcx, 8); | ||
863 | |||
864 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
865 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
866 | %ymm15, | ||
867 | ((key_table + (8) * 8) + 8)(CTX), | ||
868 | ((key_table + (8) * 8) + 12)(CTX), | ||
869 | ((key_table + (8) * 8) + 0)(CTX), | ||
870 | ((key_table + (8) * 8) + 4)(CTX)); | ||
871 | |||
872 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
873 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
874 | %ymm15, %rax, %rcx, 0); | ||
875 | |||
876 | /* load CD for output */ | ||
877 | vmovdqu 0 * 32(%rcx), %ymm8; | ||
878 | vmovdqu 1 * 32(%rcx), %ymm9; | ||
879 | vmovdqu 2 * 32(%rcx), %ymm10; | ||
880 | vmovdqu 3 * 32(%rcx), %ymm11; | ||
881 | vmovdqu 4 * 32(%rcx), %ymm12; | ||
882 | vmovdqu 5 * 32(%rcx), %ymm13; | ||
883 | vmovdqu 6 * 32(%rcx), %ymm14; | ||
884 | vmovdqu 7 * 32(%rcx), %ymm15; | ||
885 | |||
886 | outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
887 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
888 | %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); | ||
889 | |||
890 | ret; | ||
891 | |||
892 | .align 8 | ||
893 | .Ldec_max32: | ||
894 | dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
895 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
896 | %ymm15, %rax, %rcx, 24); | ||
897 | |||
898 | fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
899 | %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
900 | %ymm15, | ||
901 | ((key_table + (24) * 8) + 8)(CTX), | ||
902 | ((key_table + (24) * 8) + 12)(CTX), | ||
903 | ((key_table + (24) * 8) + 0)(CTX), | ||
904 | ((key_table + (24) * 8) + 4)(CTX)); | ||
905 | |||
906 | jmp .Ldec_max24; | ||
907 | ENDPROC(__camellia_dec_blk32) | ||
908 | |||
909 | ENTRY(camellia_ecb_enc_32way) | ||
910 | /* input: | ||
911 | * %rdi: ctx, CTX | ||
912 | * %rsi: dst (32 blocks) | ||
913 | * %rdx: src (32 blocks) | ||
914 | */ | ||
915 | |||
916 | vzeroupper; | ||
917 | |||
918 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
919 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
920 | %ymm15, %rdx, (key_table)(CTX)); | ||
921 | |||
922 | /* now dst can be used as temporary buffer (even in src == dst case) */ | ||
923 | movq %rsi, %rax; | ||
924 | |||
925 | call __camellia_enc_blk32; | ||
926 | |||
927 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
928 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
929 | %ymm8, %rsi); | ||
930 | |||
931 | vzeroupper; | ||
932 | |||
933 | ret; | ||
934 | ENDPROC(camellia_ecb_enc_32way) | ||
935 | |||
936 | ENTRY(camellia_ecb_dec_32way) | ||
937 | /* input: | ||
938 | * %rdi: ctx, CTX | ||
939 | * %rsi: dst (32 blocks) | ||
940 | * %rdx: src (32 blocks) | ||
941 | */ | ||
942 | |||
943 | vzeroupper; | ||
944 | |||
945 | cmpl $16, key_length(CTX); | ||
946 | movl $32, %r8d; | ||
947 | movl $24, %eax; | ||
948 | cmovel %eax, %r8d; /* max */ | ||
949 | |||
950 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
951 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
952 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | ||
953 | |||
954 | /* now dst can be used as temporary buffer (even in src == dst case) */ | ||
955 | movq %rsi, %rax; | ||
956 | |||
957 | call __camellia_dec_blk32; | ||
958 | |||
959 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
960 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
961 | %ymm8, %rsi); | ||
962 | |||
963 | vzeroupper; | ||
964 | |||
965 | ret; | ||
966 | ENDPROC(camellia_ecb_dec_32way) | ||
967 | |||
968 | ENTRY(camellia_cbc_dec_32way) | ||
969 | /* input: | ||
970 | * %rdi: ctx, CTX | ||
971 | * %rsi: dst (32 blocks) | ||
972 | * %rdx: src (32 blocks) | ||
973 | */ | ||
974 | |||
975 | vzeroupper; | ||
976 | |||
977 | cmpl $16, key_length(CTX); | ||
978 | movl $32, %r8d; | ||
979 | movl $24, %eax; | ||
980 | cmovel %eax, %r8d; /* max */ | ||
981 | |||
982 | inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, | ||
983 | %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, | ||
984 | %ymm15, %rdx, (key_table)(CTX, %r8, 8)); | ||
985 | |||
986 | movq %rsp, %r10; | ||
987 | cmpq %rsi, %rdx; | ||
988 | je .Lcbc_dec_use_stack; | ||
989 | |||
990 | /* dst can be used as temporary storage, src is not overwritten. */ | ||
991 | movq %rsi, %rax; | ||
992 | jmp .Lcbc_dec_continue; | ||
993 | |||
994 | .Lcbc_dec_use_stack: | ||
995 | /* | ||
996 | * dst still in-use (because dst == src), so use stack for temporary | ||
997 | * storage. | ||
998 | */ | ||
999 | subq $(16 * 32), %rsp; | ||
1000 | movq %rsp, %rax; | ||
1001 | |||
1002 | .Lcbc_dec_continue: | ||
1003 | call __camellia_dec_blk32; | ||
1004 | |||
1005 | vmovdqu %ymm7, (%rax); | ||
1006 | vpxor %ymm7, %ymm7, %ymm7; | ||
1007 | vinserti128 $1, (%rdx), %ymm7, %ymm7; | ||
1008 | vpxor (%rax), %ymm7, %ymm7; | ||
1009 | movq %r10, %rsp; | ||
1010 | vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6; | ||
1011 | vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5; | ||
1012 | vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4; | ||
1013 | vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3; | ||
1014 | vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2; | ||
1015 | vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1; | ||
1016 | vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0; | ||
1017 | vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15; | ||
1018 | vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14; | ||
1019 | vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13; | ||
1020 | vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12; | ||
1021 | vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11; | ||
1022 | vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10; | ||
1023 | vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9; | ||
1024 | vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8; | ||
1025 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1026 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1027 | %ymm8, %rsi); | ||
1028 | |||
1029 | vzeroupper; | ||
1030 | |||
1031 | ret; | ||
1032 | ENDPROC(camellia_cbc_dec_32way) | ||
1033 | |||
1034 | #define inc_le128(x, minus_one, tmp) \ | ||
1035 | vpcmpeqq minus_one, x, tmp; \ | ||
1036 | vpsubq minus_one, x, x; \ | ||
1037 | vpslldq $8, tmp, tmp; \ | ||
1038 | vpsubq tmp, x, x; | ||
1039 | |||
1040 | #define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \ | ||
1041 | vpcmpeqq minus_one, x, tmp1; \ | ||
1042 | vpcmpeqq minus_two, x, tmp2; \ | ||
1043 | vpsubq minus_two, x, x; \ | ||
1044 | vpor tmp2, tmp1, tmp1; \ | ||
1045 | vpslldq $8, tmp1, tmp1; \ | ||
1046 | vpsubq tmp1, x, x; | ||
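inc_le128 and add2_le128 increment a 128-bit little-endian counter without a
128-bit add: subtracting the all-ones constant adds 1 (or 2) to the low
qword, and the vpcmpeqq/vpslldq pair detects an about-to-wrap low qword and
injects the carry into the high qword. A scalar model of the single
increment (a sketch, not kernel code):

/*
 * 128-bit little-endian increment, held as low/high 64-bit halves.
 * The carry into the high half fires exactly when the low half was
 * all-ones before the increment, which is what vpcmpeqq tests.
 */
static void inc_le128_model(unsigned long long v[2])
{
	unsigned long long carry = (v[0] == ~0ULL);

	v[0] += 1;     /* x - (-1) == x + 1 on the low qword */
	v[1] += carry; /* propagate the carry */
}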
1047 | |||
1048 | ENTRY(camellia_ctr_32way) | ||
1049 | /* input: | ||
1050 | * %rdi: ctx, CTX | ||
1051 | * %rsi: dst (32 blocks) | ||
1052 | * %rdx: src (32 blocks) | ||
1053 | * %rcx: iv (little endian, 128bit) | ||
1054 | */ | ||
1055 | |||
1056 | vzeroupper; | ||
1057 | |||
1058 | movq %rsp, %r10; | ||
1059 | cmpq %rsi, %rdx; | ||
1060 | je .Lctr_use_stack; | ||
1061 | |||
1062 | /* dst can be used as temporary storage, src is not overwritten. */ | ||
1063 | movq %rsi, %rax; | ||
1064 | jmp .Lctr_continue; | ||
1065 | |||
1066 | .Lctr_use_stack: | ||
1067 | subq $(16 * 32), %rsp; | ||
1068 | movq %rsp, %rax; | ||
1069 | |||
1070 | .Lctr_continue: | ||
1071 | vpcmpeqd %ymm15, %ymm15, %ymm15; | ||
1072 | vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */ | ||
1073 | vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */ | ||
1074 | |||
1075 | /* load IV and byteswap */ | ||
1076 | vmovdqu (%rcx), %xmm0; | ||
1077 | vmovdqa %xmm0, %xmm1; | ||
1078 | inc_le128(%xmm0, %xmm15, %xmm14); | ||
1079 | vbroadcasti128 .Lbswap128_mask, %ymm14; | ||
1080 | vinserti128 $1, %xmm0, %ymm1, %ymm0; | ||
1081 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1082 | vmovdqu %ymm13, 15 * 32(%rax); | ||
1083 | |||
1084 | /* construct IVs */ | ||
1085 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */ | ||
1086 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1087 | vmovdqu %ymm13, 14 * 32(%rax); | ||
1088 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1089 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1090 | vmovdqu %ymm13, 13 * 32(%rax); | ||
1091 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1092 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1093 | vmovdqu %ymm13, 12 * 32(%rax); | ||
1094 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1095 | vpshufb %ymm14, %ymm0, %ymm13; | ||
1096 | vmovdqu %ymm13, 11 * 32(%rax); | ||
1097 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1098 | vpshufb %ymm14, %ymm0, %ymm10; | ||
1099 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1100 | vpshufb %ymm14, %ymm0, %ymm9; | ||
1101 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1102 | vpshufb %ymm14, %ymm0, %ymm8; | ||
1103 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1104 | vpshufb %ymm14, %ymm0, %ymm7; | ||
1105 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1106 | vpshufb %ymm14, %ymm0, %ymm6; | ||
1107 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1108 | vpshufb %ymm14, %ymm0, %ymm5; | ||
1109 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1110 | vpshufb %ymm14, %ymm0, %ymm4; | ||
1111 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1112 | vpshufb %ymm14, %ymm0, %ymm3; | ||
1113 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1114 | vpshufb %ymm14, %ymm0, %ymm2; | ||
1115 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1116 | vpshufb %ymm14, %ymm0, %ymm1; | ||
1117 | add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); | ||
1118 | vextracti128 $1, %ymm0, %xmm13; | ||
1119 | vpshufb %ymm14, %ymm0, %ymm0; | ||
1120 | inc_le128(%xmm13, %xmm15, %xmm14); | ||
1121 | vmovdqu %xmm13, (%rcx); | ||
1122 | |||
1123 | /* inpack32_pre: */ | ||
1124 | vpbroadcastq (key_table)(CTX), %ymm15; | ||
1125 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | ||
1126 | vpxor %ymm0, %ymm15, %ymm0; | ||
1127 | vpxor %ymm1, %ymm15, %ymm1; | ||
1128 | vpxor %ymm2, %ymm15, %ymm2; | ||
1129 | vpxor %ymm3, %ymm15, %ymm3; | ||
1130 | vpxor %ymm4, %ymm15, %ymm4; | ||
1131 | vpxor %ymm5, %ymm15, %ymm5; | ||
1132 | vpxor %ymm6, %ymm15, %ymm6; | ||
1133 | vpxor %ymm7, %ymm15, %ymm7; | ||
1134 | vpxor %ymm8, %ymm15, %ymm8; | ||
1135 | vpxor %ymm9, %ymm15, %ymm9; | ||
1136 | vpxor %ymm10, %ymm15, %ymm10; | ||
1137 | vpxor 11 * 32(%rax), %ymm15, %ymm11; | ||
1138 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | ||
1139 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | ||
1140 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | ||
1141 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | ||
1142 | |||
1143 | call __camellia_enc_blk32; | ||
1144 | |||
1145 | movq %r10, %rsp; | ||
1146 | |||
1147 | vpxor 0 * 32(%rdx), %ymm7, %ymm7; | ||
1148 | vpxor 1 * 32(%rdx), %ymm6, %ymm6; | ||
1149 | vpxor 2 * 32(%rdx), %ymm5, %ymm5; | ||
1150 | vpxor 3 * 32(%rdx), %ymm4, %ymm4; | ||
1151 | vpxor 4 * 32(%rdx), %ymm3, %ymm3; | ||
1152 | vpxor 5 * 32(%rdx), %ymm2, %ymm2; | ||
1153 | vpxor 6 * 32(%rdx), %ymm1, %ymm1; | ||
1154 | vpxor 7 * 32(%rdx), %ymm0, %ymm0; | ||
1155 | vpxor 8 * 32(%rdx), %ymm15, %ymm15; | ||
1156 | vpxor 9 * 32(%rdx), %ymm14, %ymm14; | ||
1157 | vpxor 10 * 32(%rdx), %ymm13, %ymm13; | ||
1158 | vpxor 11 * 32(%rdx), %ymm12, %ymm12; | ||
1159 | vpxor 12 * 32(%rdx), %ymm11, %ymm11; | ||
1160 | vpxor 13 * 32(%rdx), %ymm10, %ymm10; | ||
1161 | vpxor 14 * 32(%rdx), %ymm9, %ymm9; | ||
1162 | vpxor 15 * 32(%rdx), %ymm8, %ymm8; | ||
1163 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1164 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1165 | %ymm8, %rsi); | ||
1166 | |||
1167 | vzeroupper; | ||
1168 | |||
1169 | ret; | ||
1170 | ENDPROC(camellia_ctr_32way) | ||
1171 | |||
1172 | #define gf128mul_x_ble(iv, mask, tmp) \ | ||
1173 | vpsrad $31, iv, tmp; \ | ||
1174 | vpaddq iv, iv, iv; \ | ||
1175 | vpshufd $0x13, tmp, tmp; \ | ||
1176 | vpand mask, tmp, tmp; \ | ||
1177 | vpxor tmp, iv, iv; | ||
1178 | |||
1179 | #define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \ | ||
1180 | vpsrad $31, iv, tmp0; \ | ||
1181 | vpaddq iv, iv, tmp1; \ | ||
1182 | vpsllq $2, iv, iv; \ | ||
1183 | vpshufd $0x13, tmp0, tmp0; \ | ||
1184 | vpsrad $31, tmp1, tmp1; \ | ||
1185 | vpand mask2, tmp0, tmp0; \ | ||
1186 | vpshufd $0x13, tmp1, tmp1; \ | ||
1187 | vpxor tmp0, iv, iv; \ | ||
1188 | vpand mask1, tmp1, tmp1; \ | ||
1189 | vpxor tmp1, iv, iv; | ||
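Both macros multiply the XTS tweak by x (and x²) in GF(2¹²⁸): a left shift
by one bit, folding the reduction polynomial 0x87 back into the low byte
whenever bit 127 shifts out. The .Lxts_gf128mul_and_shl1_mask constants
defined earlier encode both the cross-qword carry and that fold for the
vpsrad/vpshufd/vpand formulation. A scalar model of the single multiply
(a sketch, not kernel code):

/*
 * Multiply a little-endian 128-bit XTS tweak by x in GF(2^128).
 * Bit 63 of the low qword carries into the high qword; bit 127
 * shifting out folds the reduction polynomial 0x87 into byte 0.
 */
static void gf128mul_x_ble_model(unsigned long long t[2])
{
	unsigned long long carry_lo = t[0] >> 63;
	unsigned long long carry_hi = t[1] >> 63;

	t[1] = (t[1] << 1) | carry_lo;
	t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);
}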
1190 | |||
1191 | .align 8 | ||
1192 | camellia_xts_crypt_32way: | ||
1193 | /* input: | ||
1194 | * %rdi: ctx, CTX | ||
1195 | * %rsi: dst (32 blocks) | ||
1196 | * %rdx: src (32 blocks) | ||
1197 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1198 | * %r8: index for input whitening key | ||
1199 | * %r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32 | ||
1200 | */ | ||
1201 | |||
1202 | vzeroupper; | ||
1203 | |||
1204 | subq $(16 * 32), %rsp; | ||
1205 | movq %rsp, %rax; | ||
1206 | |||
1207 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12; | ||
1208 | |||
1209 | /* load IV and construct second IV */ | ||
1210 | vmovdqu (%rcx), %xmm0; | ||
1211 | vmovdqa %xmm0, %xmm15; | ||
1212 | gf128mul_x_ble(%xmm0, %xmm12, %xmm13); | ||
1213 | vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13; | ||
1214 | vinserti128 $1, %xmm0, %ymm15, %ymm0; | ||
1215 | vpxor 0 * 32(%rdx), %ymm0, %ymm15; | ||
1216 | vmovdqu %ymm15, 15 * 32(%rax); | ||
1217 | vmovdqu %ymm0, 0 * 32(%rsi); | ||
1218 | |||
1219 | /* construct IVs */ | ||
1220 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1221 | vpxor 1 * 32(%rdx), %ymm0, %ymm15; | ||
1222 | vmovdqu %ymm15, 14 * 32(%rax); | ||
1223 | vmovdqu %ymm0, 1 * 32(%rsi); | ||
1224 | |||
1225 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1226 | vpxor 2 * 32(%rdx), %ymm0, %ymm15; | ||
1227 | vmovdqu %ymm15, 13 * 32(%rax); | ||
1228 | vmovdqu %ymm0, 2 * 32(%rsi); | ||
1229 | |||
1230 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1231 | vpxor 3 * 32(%rdx), %ymm0, %ymm15; | ||
1232 | vmovdqu %ymm15, 12 * 32(%rax); | ||
1233 | vmovdqu %ymm0, 3 * 32(%rsi); | ||
1234 | |||
1235 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1236 | vpxor 4 * 32(%rdx), %ymm0, %ymm11; | ||
1237 | vmovdqu %ymm0, 4 * 32(%rsi); | ||
1238 | |||
1239 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1240 | vpxor 5 * 32(%rdx), %ymm0, %ymm10; | ||
1241 | vmovdqu %ymm0, 5 * 32(%rsi); | ||
1242 | |||
1243 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1244 | vpxor 6 * 32(%rdx), %ymm0, %ymm9; | ||
1245 | vmovdqu %ymm0, 6 * 32(%rsi); | ||
1246 | |||
1247 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1248 | vpxor 7 * 32(%rdx), %ymm0, %ymm8; | ||
1249 | vmovdqu %ymm0, 7 * 32(%rsi); | ||
1250 | |||
1251 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1252 | vpxor 8 * 32(%rdx), %ymm0, %ymm7; | ||
1253 | vmovdqu %ymm0, 8 * 32(%rsi); | ||
1254 | |||
1255 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1256 | vpxor 9 * 32(%rdx), %ymm0, %ymm6; | ||
1257 | vmovdqu %ymm0, 9 * 32(%rsi); | ||
1258 | |||
1259 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1260 | vpxor 10 * 32(%rdx), %ymm0, %ymm5; | ||
1261 | vmovdqu %ymm0, 10 * 32(%rsi); | ||
1262 | |||
1263 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1264 | vpxor 11 * 32(%rdx), %ymm0, %ymm4; | ||
1265 | vmovdqu %ymm0, 11 * 32(%rsi); | ||
1266 | |||
1267 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1268 | vpxor 12 * 32(%rdx), %ymm0, %ymm3; | ||
1269 | vmovdqu %ymm0, 12 * 32(%rsi); | ||
1270 | |||
1271 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1272 | vpxor 13 * 32(%rdx), %ymm0, %ymm2; | ||
1273 | vmovdqu %ymm0, 13 * 32(%rsi); | ||
1274 | |||
1275 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1276 | vpxor 14 * 32(%rdx), %ymm0, %ymm1; | ||
1277 | vmovdqu %ymm0, 14 * 32(%rsi); | ||
1278 | |||
1279 | gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15); | ||
1280 | vpxor 15 * 32(%rdx), %ymm0, %ymm15; | ||
1281 | vmovdqu %ymm15, 0 * 32(%rax); | ||
1282 | vmovdqu %ymm0, 15 * 32(%rsi); | ||
1283 | |||
1284 | vextracti128 $1, %ymm0, %xmm0; | ||
1285 | gf128mul_x_ble(%xmm0, %xmm12, %xmm15); | ||
1286 | vmovdqu %xmm0, (%rcx); | ||
1287 | |||
1288 | /* inpack32_pre: */ | ||
1289 | vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; | ||
1290 | vpshufb .Lpack_bswap, %ymm15, %ymm15; | ||
1291 | vpxor 0 * 32(%rax), %ymm15, %ymm0; | ||
1292 | vpxor %ymm1, %ymm15, %ymm1; | ||
1293 | vpxor %ymm2, %ymm15, %ymm2; | ||
1294 | vpxor %ymm3, %ymm15, %ymm3; | ||
1295 | vpxor %ymm4, %ymm15, %ymm4; | ||
1296 | vpxor %ymm5, %ymm15, %ymm5; | ||
1297 | vpxor %ymm6, %ymm15, %ymm6; | ||
1298 | vpxor %ymm7, %ymm15, %ymm7; | ||
1299 | vpxor %ymm8, %ymm15, %ymm8; | ||
1300 | vpxor %ymm9, %ymm15, %ymm9; | ||
1301 | vpxor %ymm10, %ymm15, %ymm10; | ||
1302 | vpxor %ymm11, %ymm15, %ymm11; | ||
1303 | vpxor 12 * 32(%rax), %ymm15, %ymm12; | ||
1304 | vpxor 13 * 32(%rax), %ymm15, %ymm13; | ||
1305 | vpxor 14 * 32(%rax), %ymm15, %ymm14; | ||
1306 | vpxor 15 * 32(%rax), %ymm15, %ymm15; | ||
1307 | |||
1308 | call *%r9; | ||
1309 | |||
1310 | addq $(16 * 32), %rsp; | ||
1311 | |||
1312 | vpxor 0 * 32(%rsi), %ymm7, %ymm7; | ||
1313 | vpxor 1 * 32(%rsi), %ymm6, %ymm6; | ||
1314 | vpxor 2 * 32(%rsi), %ymm5, %ymm5; | ||
1315 | vpxor 3 * 32(%rsi), %ymm4, %ymm4; | ||
1316 | vpxor 4 * 32(%rsi), %ymm3, %ymm3; | ||
1317 | vpxor 5 * 32(%rsi), %ymm2, %ymm2; | ||
1318 | vpxor 6 * 32(%rsi), %ymm1, %ymm1; | ||
1319 | vpxor 7 * 32(%rsi), %ymm0, %ymm0; | ||
1320 | vpxor 8 * 32(%rsi), %ymm15, %ymm15; | ||
1321 | vpxor 9 * 32(%rsi), %ymm14, %ymm14; | ||
1322 | vpxor 10 * 32(%rsi), %ymm13, %ymm13; | ||
1323 | vpxor 11 * 32(%rsi), %ymm12, %ymm12; | ||
1324 | vpxor 12 * 32(%rsi), %ymm11, %ymm11; | ||
1325 | vpxor 13 * 32(%rsi), %ymm10, %ymm10; | ||
1326 | vpxor 14 * 32(%rsi), %ymm9, %ymm9; | ||
1327 | vpxor 15 * 32(%rsi), %ymm8, %ymm8; | ||
1328 | write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0, | ||
1329 | %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9, | ||
1330 | %ymm8, %rsi); | ||
1331 | |||
1332 | vzeroupper; | ||
1333 | |||
1334 | ret; | ||
1335 | ENDPROC(camellia_xts_crypt_32way) | ||
1336 | |||
1337 | ENTRY(camellia_xts_enc_32way) | ||
1338 | /* input: | ||
1339 | * %rdi: ctx, CTX | ||
1340 | * %rsi: dst (32 blocks) | ||
1341 | * %rdx: src (32 blocks) | ||
1342 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1343 | */ | ||
1344 | |||
1345 | xorl %r8d, %r8d; /* input whitening key, 0 for enc */ | ||
1346 | |||
1347 | leaq __camellia_enc_blk32, %r9; | ||
1348 | |||
1349 | jmp camellia_xts_crypt_32way; | ||
1350 | ENDPROC(camellia_xts_enc_32way) | ||
1351 | |||
1352 | ENTRY(camellia_xts_dec_32way) | ||
1353 | /* input: | ||
1354 | * %rdi: ctx, CTX | ||
1355 | * %rsi: dst (32 blocks) | ||
1356 | * %rdx: src (32 blocks) | ||
1357 | * %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸)) | ||
1358 | */ | ||
1359 | |||
1360 | cmpl $16, key_length(CTX); | ||
1361 | movl $32, %r8d; | ||
1362 | movl $24, %eax; | ||
1363 | cmovel %eax, %r8d; /* input whitening key, last for dec */ | ||
1364 | |||
1365 | leaq __camellia_dec_blk32, %r9; | ||
1366 | |||
1367 | jmp camellia_xts_crypt_32way; | ||
1368 | ENDPROC(camellia_xts_dec_32way) | ||
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
new file mode 100644
index 000000000000..414fe5d7946b
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -0,0 +1,586 @@
1 | /* | ||
2 | * Glue Code for x86_64/AVX2/AES-NI assembler optimized version of Camellia | ||
3 | * | ||
4 | * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/types.h> | ||
15 | #include <linux/crypto.h> | ||
16 | #include <linux/err.h> | ||
17 | #include <crypto/algapi.h> | ||
18 | #include <crypto/ctr.h> | ||
19 | #include <crypto/lrw.h> | ||
20 | #include <crypto/xts.h> | ||
21 | #include <asm/xcr.h> | ||
22 | #include <asm/xsave.h> | ||
23 | #include <asm/crypto/camellia.h> | ||
24 | #include <asm/crypto/ablk_helper.h> | ||
25 | #include <asm/crypto/glue_helper.h> | ||
26 | |||
27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 | ||
28 | #define CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS 32 | ||
29 | |||
30 | /* 32-way AVX2/AES-NI parallel cipher functions */ | ||
31 | asmlinkage void camellia_ecb_enc_32way(struct camellia_ctx *ctx, u8 *dst, | ||
32 | const u8 *src); | ||
33 | asmlinkage void camellia_ecb_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
34 | const u8 *src); | ||
35 | |||
36 | asmlinkage void camellia_cbc_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
37 | const u8 *src); | ||
38 | asmlinkage void camellia_ctr_32way(struct camellia_ctx *ctx, u8 *dst, | ||
39 | const u8 *src, le128 *iv); | ||
40 | |||
41 | asmlinkage void camellia_xts_enc_32way(struct camellia_ctx *ctx, u8 *dst, | ||
42 | const u8 *src, le128 *iv); | ||
43 | asmlinkage void camellia_xts_dec_32way(struct camellia_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, le128 *iv); | ||
45 | |||
46 | static const struct common_glue_ctx camellia_enc = { | ||
47 | .num_funcs = 4, | ||
48 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
49 | |||
50 | .funcs = { { | ||
51 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
52 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_32way) } | ||
53 | }, { | ||
54 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
55 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) } | ||
56 | }, { | ||
57 | .num_blocks = 2, | ||
58 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) } | ||
59 | }, { | ||
60 | .num_blocks = 1, | ||
61 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) } | ||
62 | } } | ||
63 | }; | ||
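The func tables here are ordered by descending num_blocks, so the glue
helpers use the widest implementation that still fits the remaining data
and fall back to narrower ones for the tail. Conceptually, the ECB
dispatch behaves like the following model (an illustration only, not the
actual glue_helper code, which also manages FPU state via
fpu_blocks_limit; all names in this sketch are made up):

/*
 * Model of descending-width dispatch: consume as many 32-block
 * chunks as possible, then fall through to the 16-, 2- and 1-block
 * implementations for whatever remains.
 */
struct ecb_func_model {
	unsigned int num_blocks;
	void (*fn)(void *ctx, unsigned char *dst, const unsigned char *src);
};

static void ecb_dispatch_model(const struct ecb_func_model *funcs,
			       unsigned int num_funcs, void *ctx,
			       unsigned char *dst, const unsigned char *src,
			       unsigned int nblocks)
{
	unsigned int i;

	while (nblocks) {
		for (i = 0; i < num_funcs; i++) {
			if (nblocks >= funcs[i].num_blocks) {
				funcs[i].fn(ctx, dst, src);
				dst += funcs[i].num_blocks * 16; /* 16-byte blocks */
				src += funcs[i].num_blocks * 16;
				nblocks -= funcs[i].num_blocks;
				break;
			}
		}
	}
}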
64 | |||
65 | static const struct common_glue_ctx camellia_ctr = { | ||
66 | .num_funcs = 4, | ||
67 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
68 | |||
69 | .funcs = { { | ||
70 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
71 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_32way) } | ||
72 | }, { | ||
73 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
74 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) } | ||
75 | }, { | ||
76 | .num_blocks = 2, | ||
77 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) } | ||
78 | }, { | ||
79 | .num_blocks = 1, | ||
80 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) } | ||
81 | } } | ||
82 | }; | ||
83 | |||
84 | static const struct common_glue_ctx camellia_enc_xts = { | ||
85 | .num_funcs = 3, | ||
86 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
87 | |||
88 | .funcs = { { | ||
89 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
90 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_32way) } | ||
91 | }, { | ||
92 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
93 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc_16way) } | ||
94 | }, { | ||
95 | .num_blocks = 1, | ||
96 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_enc) } | ||
97 | } } | ||
98 | }; | ||
99 | |||
100 | static const struct common_glue_ctx camellia_dec = { | ||
101 | .num_funcs = 4, | ||
102 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
103 | |||
104 | .funcs = { { | ||
105 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
106 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_32way) } | ||
107 | }, { | ||
108 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
109 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) } | ||
110 | }, { | ||
111 | .num_blocks = 2, | ||
112 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) } | ||
113 | }, { | ||
114 | .num_blocks = 1, | ||
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) } | ||
116 | } } | ||
117 | }; | ||
118 | |||
119 | static const struct common_glue_ctx camellia_dec_cbc = { | ||
120 | .num_funcs = 4, | ||
121 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
122 | |||
123 | .funcs = { { | ||
124 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
125 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_32way) } | ||
126 | }, { | ||
127 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
128 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) } | ||
129 | }, { | ||
130 | .num_blocks = 2, | ||
131 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) } | ||
132 | }, { | ||
133 | .num_blocks = 1, | ||
134 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) } | ||
135 | } } | ||
136 | }; | ||
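Why CBC decryption gets 32-/16-/2-way entries while cbc_encrypt() below stays single-block: the two directions have different data dependencies,

	\[ C_i = E_K(P_i \oplus C_{i-1}), \qquad P_i = D_K(C_i) \oplus C_{i-1}. \]

Encryption cannot start block i until C_{i-1} exists, so it is inherently serial; decryption has every ciphertext block available up front, so 32 blocks can go through the cipher at once and the XOR chain is applied afterwards.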
137 | |||
138 | static const struct common_glue_ctx camellia_dec_xts = { | ||
139 | .num_funcs = 3, | ||
140 | .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
141 | |||
142 | .funcs = { { | ||
143 | .num_blocks = CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS, | ||
144 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_32way) } | ||
145 | }, { | ||
146 | .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS, | ||
147 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec_16way) } | ||
148 | }, { | ||
149 | .num_blocks = 1, | ||
150 | .fn_u = { .xts = GLUE_XTS_FUNC_CAST(camellia_xts_dec) } | ||
151 | } } | ||
152 | }; | ||
153 | |||
154 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
155 | struct scatterlist *src, unsigned int nbytes) | ||
156 | { | ||
157 | return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes); | ||
158 | } | ||
159 | |||
160 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
161 | struct scatterlist *src, unsigned int nbytes) | ||
162 | { | ||
163 | return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes); | ||
164 | } | ||
165 | |||
166 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
167 | struct scatterlist *src, unsigned int nbytes) | ||
168 | { | ||
169 | return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc, | ||
170 | dst, src, nbytes); | ||
171 | } | ||
172 | |||
173 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
174 | struct scatterlist *src, unsigned int nbytes) | ||
175 | { | ||
176 | return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src, | ||
177 | nbytes); | ||
178 | } | ||
179 | |||
180 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
181 | struct scatterlist *src, unsigned int nbytes) | ||
182 | { | ||
183 | return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes); | ||
184 | } | ||
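CTR mode turns the block cipher into a keystream generator, so a single routine serves both directions — which is also why the ctr(camellia) registration further down wires both .encrypt and .decrypt to ablk_encrypt:

	\[ C_i = P_i \oplus E_K(\mathrm{ctr} + i), \qquad P_i = C_i \oplus E_K(\mathrm{ctr} + i). \]

All counter values are known in advance, so an entire 32-block batch of keystream can be computed in parallel.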
185 | |||
186 | static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes) | ||
187 | { | ||
188 | return glue_fpu_begin(CAMELLIA_BLOCK_SIZE, | ||
189 | CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled, | ||
190 | nbytes); | ||
191 | } | ||
192 | |||
193 | static inline void camellia_fpu_end(bool fpu_enabled) | ||
194 | { | ||
195 | glue_fpu_end(fpu_enabled); | ||
196 | } | ||
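glue_fpu_begin() only enters FPU context once a request is large enough to reach the 16-way path; anything smaller stays on the scalar/2-way integer code and skips the fixed cost of saving and restoring vector state. A simplified, hypothetical sketch of that policy — the real helper also checks irq_fpu_usable() and honours fpu_blocks_limit:

	static bool fpu_begin_if_worthwhile(bool fpu_enabled, unsigned int nbytes)
	{
		if (fpu_enabled)
			return true;	/* already inside kernel_fpu_begin() */

		if (nbytes < CAMELLIA_AESNI_PARALLEL_BLOCKS * CAMELLIA_BLOCK_SIZE)
			return false;	/* too small to pay for vector state */

		kernel_fpu_begin();
		return true;
	}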
197 | |||
198 | static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, | ||
199 | unsigned int key_len) | ||
200 | { | ||
201 | return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len, | ||
202 | &tfm->crt_flags); | ||
203 | } | ||
204 | |||
205 | struct crypt_priv { | ||
206 | struct camellia_ctx *ctx; | ||
207 | bool fpu_enabled; | ||
208 | }; | ||
209 | |||
210 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
211 | { | ||
212 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
213 | struct crypt_priv *ctx = priv; | ||
214 | int i; | ||
215 | |||
216 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
217 | |||
218 | if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { | ||
219 | camellia_ecb_enc_32way(ctx->ctx, srcdst, srcdst); | ||
220 | srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
221 | nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
222 | } | ||
223 | |||
224 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
225 | camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | ||
226 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
227 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
228 | } | ||
229 | |||
230 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
231 | camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); | ||
232 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
233 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
234 | } | ||
235 | |||
236 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
237 | camellia_enc_blk(ctx->ctx, srcdst, srcdst); | ||
238 | } | ||
239 | |||
240 | static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | ||
241 | { | ||
242 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | ||
243 | struct crypt_priv *ctx = priv; | ||
244 | int i; | ||
245 | |||
246 | ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes); | ||
247 | |||
248 | if (nbytes >= CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS * bsize) { | ||
249 | camellia_ecb_dec_32way(ctx->ctx, srcdst, srcdst); | ||
250 | srcdst += bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
251 | nbytes -= bsize * CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS; | ||
252 | } | ||
253 | |||
254 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | ||
255 | camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | ||
256 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
257 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | ||
258 | } | ||
259 | |||
260 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | ||
261 | camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); | ||
262 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
263 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | ||
264 | } | ||
265 | |||
266 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | ||
267 | camellia_dec_blk(ctx->ctx, srcdst, srcdst); | ||
268 | } | ||
269 | |||
270 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
271 | struct scatterlist *src, unsigned int nbytes) | ||
272 | { | ||
273 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
274 | be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; | ||
275 | struct crypt_priv crypt_ctx = { | ||
276 | .ctx = &ctx->camellia_ctx, | ||
277 | .fpu_enabled = false, | ||
278 | }; | ||
279 | struct lrw_crypt_req req = { | ||
280 | .tbuf = buf, | ||
281 | .tbuflen = sizeof(buf), | ||
282 | |||
283 | .table_ctx = &ctx->lrw_table, | ||
284 | .crypt_ctx = &crypt_ctx, | ||
285 | .crypt_fn = encrypt_callback, | ||
286 | }; | ||
287 | int ret; | ||
288 | |||
289 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
290 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
291 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
292 | |||
293 | return ret; | ||
294 | } | ||
295 | |||
296 | static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
297 | struct scatterlist *src, unsigned int nbytes) | ||
298 | { | ||
299 | struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
300 | be128 buf[CAMELLIA_AESNI_AVX2_PARALLEL_BLOCKS]; | ||
301 | struct crypt_priv crypt_ctx = { | ||
302 | .ctx = &ctx->camellia_ctx, | ||
303 | .fpu_enabled = false, | ||
304 | }; | ||
305 | struct lrw_crypt_req req = { | ||
306 | .tbuf = buf, | ||
307 | .tbuflen = sizeof(buf), | ||
308 | |||
309 | .table_ctx = &ctx->lrw_table, | ||
310 | .crypt_ctx = &crypt_ctx, | ||
311 | .crypt_fn = decrypt_callback, | ||
312 | }; | ||
313 | int ret; | ||
314 | |||
315 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | ||
316 | ret = lrw_crypt(desc, dst, src, nbytes, &req); | ||
317 | camellia_fpu_end(crypt_ctx.fpu_enabled); | ||
318 | |||
319 | return ret; | ||
320 | } | ||
321 | |||
322 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
323 | struct scatterlist *src, unsigned int nbytes) | ||
324 | { | ||
325 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
326 | |||
327 | return glue_xts_crypt_128bit(&camellia_enc_xts, desc, dst, src, nbytes, | ||
328 | XTS_TWEAK_CAST(camellia_enc_blk), | ||
329 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
330 | } | ||
331 | |||
332 | static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
333 | struct scatterlist *src, unsigned int nbytes) | ||
334 | { | ||
335 | struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
336 | |||
337 | return glue_xts_crypt_128bit(&camellia_dec_xts, desc, dst, src, nbytes, | ||
338 | XTS_TWEAK_CAST(camellia_enc_blk), | ||
339 | &ctx->tweak_ctx, &ctx->crypt_ctx); | ||
340 | } | ||
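Note that both xts_encrypt() and xts_decrypt() pass XTS_TWEAK_CAST(camellia_enc_blk) — this is deliberate. In XTS the tweak is always produced by encrypting the sector IV under the second key, regardless of data direction:

	\[ T_0 = E_{K_2}(\mathrm{IV}), \qquad T_j = T_0 \cdot \alpha^{j}, \qquad C_j = E_{K_1}(P_j \oplus T_j) \oplus T_j. \]

Decryption swaps the middle operation for D_{K_1} but reuses the same tweak sequence, which the 32-way assembly advances with the α-multiplication sketched earlier.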
341 | |||
342 | static struct crypto_alg cmll_algs[10] = { { | ||
343 | .cra_name = "__ecb-camellia-aesni-avx2", | ||
344 | .cra_driver_name = "__driver-ecb-camellia-aesni-avx2", | ||
345 | .cra_priority = 0, | ||
346 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
347 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
348 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
349 | .cra_alignmask = 0, | ||
350 | .cra_type = &crypto_blkcipher_type, | ||
351 | .cra_module = THIS_MODULE, | ||
352 | .cra_u = { | ||
353 | .blkcipher = { | ||
354 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
355 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
356 | .setkey = camellia_setkey, | ||
357 | .encrypt = ecb_encrypt, | ||
358 | .decrypt = ecb_decrypt, | ||
359 | }, | ||
360 | }, | ||
361 | }, { | ||
362 | .cra_name = "__cbc-camellia-aesni-avx2", | ||
363 | .cra_driver_name = "__driver-cbc-camellia-aesni-avx2", | ||
364 | .cra_priority = 0, | ||
365 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
366 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
367 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
368 | .cra_alignmask = 0, | ||
369 | .cra_type = &crypto_blkcipher_type, | ||
370 | .cra_module = THIS_MODULE, | ||
371 | .cra_u = { | ||
372 | .blkcipher = { | ||
373 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
374 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
375 | .setkey = camellia_setkey, | ||
376 | .encrypt = cbc_encrypt, | ||
377 | .decrypt = cbc_decrypt, | ||
378 | }, | ||
379 | }, | ||
380 | }, { | ||
381 | .cra_name = "__ctr-camellia-aesni-avx2", | ||
382 | .cra_driver_name = "__driver-ctr-camellia-aesni-avx2", | ||
383 | .cra_priority = 0, | ||
384 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
385 | .cra_blocksize = 1, | ||
386 | .cra_ctxsize = sizeof(struct camellia_ctx), | ||
387 | .cra_alignmask = 0, | ||
388 | .cra_type = &crypto_blkcipher_type, | ||
389 | .cra_module = THIS_MODULE, | ||
390 | .cra_u = { | ||
391 | .blkcipher = { | ||
392 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
393 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
394 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
395 | .setkey = camellia_setkey, | ||
396 | .encrypt = ctr_crypt, | ||
397 | .decrypt = ctr_crypt, | ||
398 | }, | ||
399 | }, | ||
400 | }, { | ||
401 | .cra_name = "__lrw-camellia-aesni-avx2", | ||
402 | .cra_driver_name = "__driver-lrw-camellia-aesni-avx2", | ||
403 | .cra_priority = 0, | ||
404 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
405 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
406 | .cra_ctxsize = sizeof(struct camellia_lrw_ctx), | ||
407 | .cra_alignmask = 0, | ||
408 | .cra_type = &crypto_blkcipher_type, | ||
409 | .cra_module = THIS_MODULE, | ||
410 | .cra_exit = lrw_camellia_exit_tfm, | ||
411 | .cra_u = { | ||
412 | .blkcipher = { | ||
413 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | ||
414 | CAMELLIA_BLOCK_SIZE, | ||
415 | .max_keysize = CAMELLIA_MAX_KEY_SIZE + | ||
416 | CAMELLIA_BLOCK_SIZE, | ||
417 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
418 | .setkey = lrw_camellia_setkey, | ||
419 | .encrypt = lrw_encrypt, | ||
420 | .decrypt = lrw_decrypt, | ||
421 | }, | ||
422 | }, | ||
423 | }, { | ||
424 | .cra_name = "__xts-camellia-aesni-avx2", | ||
425 | .cra_driver_name = "__driver-xts-camellia-aesni-avx2", | ||
426 | .cra_priority = 0, | ||
427 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
428 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
429 | .cra_ctxsize = sizeof(struct camellia_xts_ctx), | ||
430 | .cra_alignmask = 0, | ||
431 | .cra_type = &crypto_blkcipher_type, | ||
432 | .cra_module = THIS_MODULE, | ||
433 | .cra_u = { | ||
434 | .blkcipher = { | ||
435 | .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, | ||
436 | .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, | ||
437 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
438 | .setkey = xts_camellia_setkey, | ||
439 | .encrypt = xts_encrypt, | ||
440 | .decrypt = xts_decrypt, | ||
441 | }, | ||
442 | }, | ||
443 | }, { | ||
444 | .cra_name = "ecb(camellia)", | ||
445 | .cra_driver_name = "ecb-camellia-aesni-avx2", | ||
446 | .cra_priority = 500, | ||
447 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
448 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
449 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
450 | .cra_alignmask = 0, | ||
451 | .cra_type = &crypto_ablkcipher_type, | ||
452 | .cra_module = THIS_MODULE, | ||
453 | .cra_init = ablk_init, | ||
454 | .cra_exit = ablk_exit, | ||
455 | .cra_u = { | ||
456 | .ablkcipher = { | ||
457 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
458 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
459 | .setkey = ablk_set_key, | ||
460 | .encrypt = ablk_encrypt, | ||
461 | .decrypt = ablk_decrypt, | ||
462 | }, | ||
463 | }, | ||
464 | }, { | ||
465 | .cra_name = "cbc(camellia)", | ||
466 | .cra_driver_name = "cbc-camellia-aesni-avx2", | ||
467 | .cra_priority = 500, | ||
468 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
469 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
470 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
471 | .cra_alignmask = 0, | ||
472 | .cra_type = &crypto_ablkcipher_type, | ||
473 | .cra_module = THIS_MODULE, | ||
474 | .cra_init = ablk_init, | ||
475 | .cra_exit = ablk_exit, | ||
476 | .cra_u = { | ||
477 | .ablkcipher = { | ||
478 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
479 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
480 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
481 | .setkey = ablk_set_key, | ||
482 | .encrypt = __ablk_encrypt, | ||
483 | .decrypt = ablk_decrypt, | ||
484 | }, | ||
485 | }, | ||
486 | }, { | ||
487 | .cra_name = "ctr(camellia)", | ||
488 | .cra_driver_name = "ctr-camellia-aesni-avx2", | ||
489 | .cra_priority = 500, | ||
490 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
491 | .cra_blocksize = 1, | ||
492 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
493 | .cra_alignmask = 0, | ||
494 | .cra_type = &crypto_ablkcipher_type, | ||
495 | .cra_module = THIS_MODULE, | ||
496 | .cra_init = ablk_init, | ||
497 | .cra_exit = ablk_exit, | ||
498 | .cra_u = { | ||
499 | .ablkcipher = { | ||
500 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
501 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
502 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
503 | .setkey = ablk_set_key, | ||
504 | .encrypt = ablk_encrypt, | ||
505 | .decrypt = ablk_encrypt, | ||
506 | .geniv = "chainiv", | ||
507 | }, | ||
508 | }, | ||
509 | }, { | ||
510 | .cra_name = "lrw(camellia)", | ||
511 | .cra_driver_name = "lrw-camellia-aesni-avx2", | ||
512 | .cra_priority = 500, | ||
513 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
514 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
515 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
516 | .cra_alignmask = 0, | ||
517 | .cra_type = &crypto_ablkcipher_type, | ||
518 | .cra_module = THIS_MODULE, | ||
519 | .cra_init = ablk_init, | ||
520 | .cra_exit = ablk_exit, | ||
521 | .cra_u = { | ||
522 | .ablkcipher = { | ||
523 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | ||
524 | CAMELLIA_BLOCK_SIZE, | ||
525 | .max_keysize = CAMELLIA_MAX_KEY_SIZE + | ||
526 | CAMELLIA_BLOCK_SIZE, | ||
527 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
528 | .setkey = ablk_set_key, | ||
529 | .encrypt = ablk_encrypt, | ||
530 | .decrypt = ablk_decrypt, | ||
531 | }, | ||
532 | }, | ||
533 | }, { | ||
534 | .cra_name = "xts(camellia)", | ||
535 | .cra_driver_name = "xts-camellia-aesni-avx2", | ||
536 | .cra_priority = 500, | ||
537 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
538 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
539 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
540 | .cra_alignmask = 0, | ||
541 | .cra_type = &crypto_ablkcipher_type, | ||
542 | .cra_module = THIS_MODULE, | ||
543 | .cra_init = ablk_init, | ||
544 | .cra_exit = ablk_exit, | ||
545 | .cra_u = { | ||
546 | .ablkcipher = { | ||
547 | .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, | ||
548 | .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, | ||
549 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
550 | .setkey = ablk_set_key, | ||
551 | .encrypt = ablk_encrypt, | ||
552 | .decrypt = ablk_decrypt, | ||
553 | }, | ||
554 | }, | ||
555 | } }; | ||
556 | |||
557 | static int __init camellia_aesni_init(void) | ||
558 | { | ||
559 | u64 xcr0; | ||
560 | |||
561 | if (!cpu_has_avx2 || !cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { | ||
562 | pr_info("AVX2 or AES-NI instructions are not detected.\n"); | ||
563 | return -ENODEV; | ||
564 | } | ||
565 | |||
566 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
567 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
568 | pr_info("AVX2 detected but unusable.\n"); | ||
569 | return -ENODEV; | ||
570 | } | ||
571 | |||
572 | return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
573 | } | ||
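The XCR0 probe above matters because CPUID only reports what the hardware can do; XCR0 reports which register state the OS actually saves across context switches, and AVX2 is unusable unless both SSE (bit 1) and YMM (bit 2) state are enabled. An illustrative standalone rendition of what xgetbv(XCR_XFEATURE_ENABLED_MASK) boils down to — not the kernel's helper:

	#include <stdint.h>

	static inline uint64_t read_xcr0(void)
	{
		uint32_t eax, edx;

		/* XGETBV with ECX = 0 reads XCR0 (XFEATURE_ENABLED_MASK). */
		__asm__ volatile ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
		return ((uint64_t)edx << 32) | eax;
	}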
574 | |||
575 | static void __exit camellia_aesni_fini(void) | ||
576 | { | ||
577 | crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
578 | } | ||
579 | |||
580 | module_init(camellia_aesni_init); | ||
581 | module_exit(camellia_aesni_fini); | ||
582 | |||
583 | MODULE_LICENSE("GPL"); | ||
584 | MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX2 optimized"); | ||
585 | MODULE_ALIAS("camellia"); | ||
586 | MODULE_ALIAS("camellia-asm"); | ||
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c index 4ff7ed47b3db..37fd0c0a81ea 100644 --- a/arch/x86/crypto/camellia_aesni_avx_glue.c +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c | |||
@@ -26,33 +26,44 @@ | |||
26 | 26 | ||
27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 | 27 | #define CAMELLIA_AESNI_PARALLEL_BLOCKS 16 |
28 | 28 | ||
29 | /* 16-way AES-NI parallel cipher functions */ | 29 | /* 16-way parallel cipher functions (avx/aes-ni) */ |
30 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, | 30 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, |
31 | const u8 *src); | 31 | const u8 *src); |
32 | EXPORT_SYMBOL_GPL(camellia_ecb_enc_16way); | ||
33 | |||
32 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, | 34 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, |
33 | const u8 *src); | 35 | const u8 *src); |
36 | EXPORT_SYMBOL_GPL(camellia_ecb_dec_16way); | ||
34 | 37 | ||
35 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | 38 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, |
36 | const u8 *src); | 39 | const u8 *src); |
40 | EXPORT_SYMBOL_GPL(camellia_cbc_dec_16way); | ||
41 | |||
37 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | 42 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, |
38 | const u8 *src, le128 *iv); | 43 | const u8 *src, le128 *iv); |
44 | EXPORT_SYMBOL_GPL(camellia_ctr_16way); | ||
39 | 45 | ||
40 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, | 46 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, |
41 | const u8 *src, le128 *iv); | 47 | const u8 *src, le128 *iv); |
48 | EXPORT_SYMBOL_GPL(camellia_xts_enc_16way); | ||
49 | |||
42 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, | 50 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, |
43 | const u8 *src, le128 *iv); | 51 | const u8 *src, le128 *iv); |
52 | EXPORT_SYMBOL_GPL(camellia_xts_dec_16way); | ||
44 | 53 | ||
45 | static void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 54 | void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
46 | { | 55 | { |
47 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | 56 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, |
48 | GLUE_FUNC_CAST(camellia_enc_blk)); | 57 | GLUE_FUNC_CAST(camellia_enc_blk)); |
49 | } | 58 | } |
59 | EXPORT_SYMBOL_GPL(camellia_xts_enc); | ||
50 | 60 | ||
51 | static void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) | 61 | void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
52 | { | 62 | { |
53 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, | 63 | glue_xts_crypt_128bit_one(ctx, dst, src, iv, |
54 | GLUE_FUNC_CAST(camellia_dec_blk)); | 64 | GLUE_FUNC_CAST(camellia_dec_blk)); |
55 | } | 65 | } |
66 | EXPORT_SYMBOL_GPL(camellia_xts_dec); | ||
56 | 67 | ||
57 | static const struct common_glue_ctx camellia_enc = { | 68 | static const struct common_glue_ctx camellia_enc = { |
58 | .num_funcs = 3, | 69 | .num_funcs = 3, |
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h index 98038add801e..bb93333d9200 100644 --- a/arch/x86/include/asm/crypto/camellia.h +++ b/arch/x86/include/asm/crypto/camellia.h | |||
@@ -48,6 +48,22 @@ asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, | |||
48 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, | 48 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, |
49 | const u8 *src); | 49 | const u8 *src); |
50 | 50 | ||
51 | /* 16-way parallel cipher functions (avx/aes-ni) */ | ||
52 | asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
53 | const u8 *src); | ||
54 | asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
55 | const u8 *src); | ||
56 | |||
57 | asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
58 | const u8 *src); | ||
59 | asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst, | ||
60 | const u8 *src, le128 *iv); | ||
61 | |||
62 | asmlinkage void camellia_xts_enc_16way(struct camellia_ctx *ctx, u8 *dst, | ||
63 | const u8 *src, le128 *iv); | ||
64 | asmlinkage void camellia_xts_dec_16way(struct camellia_ctx *ctx, u8 *dst, | ||
65 | const u8 *src, le128 *iv); | ||
66 | |||
51 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, | 67 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, |
52 | const u8 *src) | 68 | const u8 *src) |
53 | { | 69 | { |
@@ -79,4 +95,7 @@ extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, | |||
79 | extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, | 95 | extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, |
80 | le128 *iv); | 96 | le128 *iv); |
81 | 97 | ||
98 | extern void camellia_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
99 | extern void camellia_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv); | ||
100 | |||
82 | #endif /* ASM_X86_CAMELLIA_H */ | 101 | #endif /* ASM_X86_CAMELLIA_H */ |
diff --git a/crypto/Kconfig b/crypto/Kconfig index 9ad3d78c1075..622d8a48cbe9 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig | |||
@@ -894,6 +894,29 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 | |||
894 | See also: | 894 | See also: |
895 | <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html> | 895 | <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html> |
896 | 896 | ||
897 | config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 | ||
898 | tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)" | ||
899 | depends on X86 && 64BIT | ||
900 | depends on CRYPTO | ||
901 | select CRYPTO_ALGAPI | ||
902 | select CRYPTO_CRYPTD | ||
903 | select CRYPTO_ABLK_HELPER_X86 | ||
904 | select CRYPTO_GLUE_HELPER_X86 | ||
905 | select CRYPTO_CAMELLIA_X86_64 | ||
906 | select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 | ||
907 | select CRYPTO_LRW | ||
908 | select CRYPTO_XTS | ||
909 | help | ||
910 | Camellia cipher algorithm module (x86_64/AES-NI/AVX2). | ||
911 | |||
912 | Camellia is a symmetric-key block cipher developed jointly | ||
913 | by NTT and Mitsubishi Electric Corporation. | ||
914 | |||
915 | Camellia specifies three key sizes: 128, 192 and 256 bits. | ||
916 | |||
917 | See also: | ||
918 | <https://info.isl.ntt.co.jp/crypt/eng/camellia/index_s.html> | ||
919 | |||
897 | config CRYPTO_CAMELLIA_SPARC64 | 920 | config CRYPTO_CAMELLIA_SPARC64 |
898 | tristate "Camellia cipher algorithm (SPARC64)" | 921 | tristate "Camellia cipher algorithm (SPARC64)" |
899 | depends on SPARC64 | 922 | depends on SPARC64 |
diff --git a/crypto/testmgr.c b/crypto/testmgr.c index f5e13dea8cc9..5823735cf381 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c | |||
@@ -1667,6 +1667,9 @@ static const struct alg_test_desc alg_test_descs[] = { | |||
1667 | .alg = "__driver-cbc-camellia-aesni", | 1667 | .alg = "__driver-cbc-camellia-aesni", |
1668 | .test = alg_test_null, | 1668 | .test = alg_test_null, |
1669 | }, { | 1669 | }, { |
1670 | .alg = "__driver-cbc-camellia-aesni-avx2", | ||
1671 | .test = alg_test_null, | ||
1672 | }, { | ||
1670 | .alg = "__driver-cbc-cast5-avx", | 1673 | .alg = "__driver-cbc-cast5-avx", |
1671 | .test = alg_test_null, | 1674 | .test = alg_test_null, |
1672 | }, { | 1675 | }, { |
@@ -1698,6 +1701,9 @@ static const struct alg_test_desc alg_test_descs[] = { | |||
1698 | .alg = "__driver-ecb-camellia-aesni", | 1701 | .alg = "__driver-ecb-camellia-aesni", |
1699 | .test = alg_test_null, | 1702 | .test = alg_test_null, |
1700 | }, { | 1703 | }, { |
1704 | .alg = "__driver-ecb-camellia-aesni-avx2", | ||
1705 | .test = alg_test_null, | ||
1706 | }, { | ||
1701 | .alg = "__driver-ecb-cast5-avx", | 1707 | .alg = "__driver-ecb-cast5-avx", |
1702 | .test = alg_test_null, | 1708 | .test = alg_test_null, |
1703 | }, { | 1709 | }, { |
@@ -1978,6 +1984,9 @@ static const struct alg_test_desc alg_test_descs[] = { | |||
1978 | .alg = "cryptd(__driver-cbc-camellia-aesni)", | 1984 | .alg = "cryptd(__driver-cbc-camellia-aesni)", |
1979 | .test = alg_test_null, | 1985 | .test = alg_test_null, |
1980 | }, { | 1986 | }, { |
1987 | .alg = "cryptd(__driver-cbc-camellia-aesni-avx2)", | ||
1988 | .test = alg_test_null, | ||
1989 | }, { | ||
1981 | .alg = "cryptd(__driver-cbc-serpent-avx2)", | 1990 | .alg = "cryptd(__driver-cbc-serpent-avx2)", |
1982 | .test = alg_test_null, | 1991 | .test = alg_test_null, |
1983 | }, { | 1992 | }, { |
@@ -1991,6 +2000,9 @@ static const struct alg_test_desc alg_test_descs[] = { | |||
1991 | .alg = "cryptd(__driver-ecb-camellia-aesni)", | 2000 | .alg = "cryptd(__driver-ecb-camellia-aesni)", |
1992 | .test = alg_test_null, | 2001 | .test = alg_test_null, |
1993 | }, { | 2002 | }, { |
2003 | .alg = "cryptd(__driver-ecb-camellia-aesni-avx2)", | ||
2004 | .test = alg_test_null, | ||
2005 | }, { | ||
1994 | .alg = "cryptd(__driver-ecb-cast5-avx)", | 2006 | .alg = "cryptd(__driver-ecb-cast5-avx)", |
1995 | .test = alg_test_null, | 2007 | .test = alg_test_null, |
1996 | }, { | 2008 | }, { |