path: root/arch/x86
author		Linus Torvalds <torvalds@linux-foundation.org>	2012-12-15 15:35:19 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-15 15:35:19 -0500
commit		1ed55eac3b1fc30b29cdb52251e0f13b24fc344c (patch)
tree		b7a4c67f2e29f8aa418708c5da871e64c511f3ff /arch/x86
parent		08242bc2210938761230f79c5288dbcf72e94808 (diff)
parent		a2c0911c09190125f52c9941b9d187f601c2f7be (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:

 - Added aesni/avx/x86_64 implementations for camellia.
 - Optimised AVX code for cast5/serpent/twofish/cast6.
 - Fixed vmac bug with unaligned input.
 - Allow compression algorithms in FIPS mode.
 - Optimised crc32c implementation for Intel.
 - Misc fixes.

* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (32 commits)
  crypto: caam - Updated SEC-4.0 device tree binding for ERA information.
  crypto: testmgr - remove superfluous initializers for xts(aes)
  crypto: testmgr - allow compression algs in fips mode
  crypto: testmgr - add larger crc32c test vector to test FPU path in crc32c_intel
  crypto: testmgr - clean alg_test_null entries in alg_test_descs[]
  crypto: testmgr - remove fips_allowed flag from camellia-aesni null-tests
  crypto: cast5/cast6 - move lookup tables to shared module
  padata: use __this_cpu_read per-cpu helper
  crypto: s5p-sss - Fix compilation error
  crypto: picoxcell - Add terminating entry for platform_device_id table
  crypto: omap-aes - select BLKCIPHER2
  crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
  crypto: camellia-x86_64 - share common functions and move structures and function definitions to header file
  crypto: tcrypt - add async speed test for camellia cipher
  crypto: tegra-aes - fix error-valued pointer dereference
  crypto: tegra - fix missing unlock on error case
  crypto: cast5/avx - avoid using temporary stack buffers
  crypto: serpent/avx - avoid using temporary stack buffers
  crypto: twofish/avx - avoid using temporary stack buffers
  crypto: cast6/avx - avoid using temporary stack buffers
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/crypto/Makefile                      |    5
-rw-r--r--  arch/x86/crypto/camellia-aesni-avx-asm_64.S   | 1102
-rw-r--r--  arch/x86/crypto/camellia_aesni_avx_glue.c     |  558
-rw-r--r--  arch/x86/crypto/camellia_glue.c               |   92
-rw-r--r--  arch/x86/crypto/cast5-avx-x86_64-asm_64.S     |  348
-rw-r--r--  arch/x86/crypto/cast5_avx_glue.c              |   79
-rw-r--r--  arch/x86/crypto/cast6-avx-x86_64-asm_64.S     |  206
-rw-r--r--  arch/x86/crypto/cast6_avx_glue.c              |   77
-rw-r--r--  arch/x86/crypto/crc32c-intel_glue.c (renamed from arch/x86/crypto/crc32c-intel.c) | 81
-rw-r--r--  arch/x86/crypto/crc32c-pcl-intel-asm_64.S     |  460
-rw-r--r--  arch/x86/crypto/glue_helper-asm-avx.S         |   91
-rw-r--r--  arch/x86/crypto/glue_helper.c                 |   12
-rw-r--r--  arch/x86/crypto/serpent-avx-x86_64-asm_64.S   |  166
-rw-r--r--  arch/x86/crypto/serpent_avx_glue.c            |   49
-rw-r--r--  arch/x86/crypto/serpent_sse2_glue.c           |   12
-rw-r--r--  arch/x86/crypto/twofish-avx-x86_64-asm_64.S   |  208
-rw-r--r--  arch/x86/crypto/twofish_avx_glue.c            |   73
-rw-r--r--  arch/x86/crypto/twofish_glue_3way.c           |   20
-rw-r--r--  arch/x86/include/asm/crypto/camellia.h        |   82
-rw-r--r--  arch/x86/include/asm/crypto/glue_helper.h     |   28
-rw-r--r--  arch/x86/include/asm/crypto/serpent-avx.h     |   27
-rw-r--r--  arch/x86/include/asm/crypto/twofish.h         |    4
22 files changed, 3160 insertions(+), 620 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5bacb4a226ac..e0ca7c9ac383 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+				camellia_aesni_avx_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..2306d2e4816f
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1102 @@
1/*
2 * x86_64/AVX/AES-NI assembler implementation of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13/*
14 * Version licensed under 2-clause BSD License is available at:
15 * http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
16 */
17
18#define CAMELLIA_TABLE_BYTE_LEN 272
19
20/* struct camellia_ctx: */
21#define key_table 0
22#define key_length CAMELLIA_TABLE_BYTE_LEN
23
24/* register macros */
25#define CTX %rdi
26
27/**********************************************************************
28 16-way camellia
29 **********************************************************************/
30#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
31 vpand x, mask4bit, tmp0; \
32 vpandn x, mask4bit, x; \
33 vpsrld $4, x, x; \
34 \
35 vpshufb tmp0, lo_t, tmp0; \
36 vpshufb x, hi_t, x; \
37 vpxor tmp0, x, x;
38
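For reference, a minimal scalar sketch (not part of the patch) of what filter_8bit computes for a single byte: the byte is split into its low and high nibbles, each nibble indexes a 16-entry table (the lo_t/hi_t vectors looked up with vpshufb), and the two results are XORed. The helper name below is hypothetical.

#include <stdint.h>

/* Illustrative scalar equivalent of filter_8bit for one byte; lo_t/hi_t
 * stand in for the 16-byte nibble lookup tables used by vpshufb. */
static inline uint8_t filter_8bit_scalar(uint8_t x,
					 const uint8_t lo_t[16],
					 const uint8_t hi_t[16])
{
	return lo_t[x & 0x0f] ^ hi_t[x >> 4];
}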
39/*
40 * IN:
41 * x0..x7: byte-sliced AB state
42 * mem_cd: register pointer storing CD state
43 * key: index for key material
44 * OUT:
45 * x0..x7: new byte-sliced CD state
46 */
47#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
48 t7, mem_cd, key) \
49 /* \
50 * S-function with AES subbytes \
51 */ \
52 vmovdqa .Linv_shift_row, t4; \
53 vbroadcastss .L0f0f0f0f, t7; \
54 vmovdqa .Lpre_tf_lo_s1, t0; \
55 vmovdqa .Lpre_tf_hi_s1, t1; \
56 \
57 /* AES inverse shift rows */ \
58 vpshufb t4, x0, x0; \
59 vpshufb t4, x7, x7; \
60 vpshufb t4, x1, x1; \
61 vpshufb t4, x4, x4; \
62 vpshufb t4, x2, x2; \
63 vpshufb t4, x5, x5; \
64 vpshufb t4, x3, x3; \
65 vpshufb t4, x6, x6; \
66 \
67 /* prefilter sboxes 1, 2 and 3 */ \
68 vmovdqa .Lpre_tf_lo_s4, t2; \
69 vmovdqa .Lpre_tf_hi_s4, t3; \
70 filter_8bit(x0, t0, t1, t7, t6); \
71 filter_8bit(x7, t0, t1, t7, t6); \
72 filter_8bit(x1, t0, t1, t7, t6); \
73 filter_8bit(x4, t0, t1, t7, t6); \
74 filter_8bit(x2, t0, t1, t7, t6); \
75 filter_8bit(x5, t0, t1, t7, t6); \
76 \
77 /* prefilter sbox 4 */ \
78 vpxor t4, t4, t4; \
79 filter_8bit(x3, t2, t3, t7, t6); \
80 filter_8bit(x6, t2, t3, t7, t6); \
81 \
82 /* AES subbytes + AES shift rows */ \
83 vmovdqa .Lpost_tf_lo_s1, t0; \
84 vmovdqa .Lpost_tf_hi_s1, t1; \
85 vaesenclast t4, x0, x0; \
86 vaesenclast t4, x7, x7; \
87 vaesenclast t4, x1, x1; \
88 vaesenclast t4, x4, x4; \
89 vaesenclast t4, x2, x2; \
90 vaesenclast t4, x5, x5; \
91 vaesenclast t4, x3, x3; \
92 vaesenclast t4, x6, x6; \
93 \
94 /* postfilter sboxes 1 and 4 */ \
95 vmovdqa .Lpost_tf_lo_s3, t2; \
96 vmovdqa .Lpost_tf_hi_s3, t3; \
97 filter_8bit(x0, t0, t1, t7, t6); \
98 filter_8bit(x7, t0, t1, t7, t6); \
99 filter_8bit(x3, t0, t1, t7, t6); \
100 filter_8bit(x6, t0, t1, t7, t6); \
101 \
102 /* postfilter sbox 3 */ \
103 vmovdqa .Lpost_tf_lo_s2, t4; \
104 vmovdqa .Lpost_tf_hi_s2, t5; \
105 filter_8bit(x2, t2, t3, t7, t6); \
106 filter_8bit(x5, t2, t3, t7, t6); \
107 \
108 vpxor t6, t6, t6; \
109 vmovq key, t0; \
110 \
111 /* postfilter sbox 2 */ \
112 filter_8bit(x1, t4, t5, t7, t2); \
113 filter_8bit(x4, t4, t5, t7, t2); \
114 \
115 vpsrldq $5, t0, t5; \
116 vpsrldq $1, t0, t1; \
117 vpsrldq $2, t0, t2; \
118 vpsrldq $3, t0, t3; \
119 vpsrldq $4, t0, t4; \
120 vpshufb t6, t0, t0; \
121 vpshufb t6, t1, t1; \
122 vpshufb t6, t2, t2; \
123 vpshufb t6, t3, t3; \
124 vpshufb t6, t4, t4; \
125 vpsrldq $2, t5, t7; \
126 vpshufb t6, t7, t7; \
127 \
128 /* \
129 * P-function \
130 */ \
131 vpxor x5, x0, x0; \
132 vpxor x6, x1, x1; \
133 vpxor x7, x2, x2; \
134 vpxor x4, x3, x3; \
135 \
136 vpxor x2, x4, x4; \
137 vpxor x3, x5, x5; \
138 vpxor x0, x6, x6; \
139 vpxor x1, x7, x7; \
140 \
141 vpxor x7, x0, x0; \
142 vpxor x4, x1, x1; \
143 vpxor x5, x2, x2; \
144 vpxor x6, x3, x3; \
145 \
146 vpxor x3, x4, x4; \
147 vpxor x0, x5, x5; \
148 vpxor x1, x6, x6; \
149 vpxor x2, x7, x7; /* note: high and low parts swapped */ \
150 \
151 /* \
152 * Add key material and result to CD (x becomes new CD) \
153 */ \
154 \
155 vpxor t3, x4, x4; \
156 vpxor 0 * 16(mem_cd), x4, x4; \
157 \
158 vpxor t2, x5, x5; \
159 vpxor 1 * 16(mem_cd), x5, x5; \
160 \
161 vpsrldq $1, t5, t3; \
162 vpshufb t6, t5, t5; \
163 vpshufb t6, t3, t6; \
164 \
165 vpxor t1, x6, x6; \
166 vpxor 2 * 16(mem_cd), x6, x6; \
167 \
168 vpxor t0, x7, x7; \
169 vpxor 3 * 16(mem_cd), x7, x7; \
170 \
171 vpxor t7, x0, x0; \
172 vpxor 4 * 16(mem_cd), x0, x0; \
173 \
174 vpxor t6, x1, x1; \
175 vpxor 5 * 16(mem_cd), x1, x1; \
176 \
177 vpxor t5, x2, x2; \
178 vpxor 6 * 16(mem_cd), x2, x2; \
179 \
180 vpxor t4, x3, x3; \
181 vpxor 7 * 16(mem_cd), x3, x3;
182
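As a rough sketch (assumption: illustrative only, not from the patch), the vmovq/vpsrldq/vpshufb sequence in roundsm16 broadcasts each byte of the 64-bit round key across a full 16-byte vector, one broadcast per byte-slice, before XORing the key material into the state. The hypothetical scalar equivalent of the broadcast step:

#include <stdint.h>

/* Broadcast byte i of the 64-bit subkey into a 16-byte lane, one lane per
 * block, mirroring the vpsrldq + vpshufb (zeroed mask) pattern above. */
static void broadcast_key_bytes(uint64_t key, uint8_t out[8][16])
{
	for (int i = 0; i < 8; i++) {
		uint8_t b = (key >> (8 * i)) & 0xff;	/* byte i of the subkey */
		for (int j = 0; j < 16; j++)
			out[i][j] = b;
	}
}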
183/*
184 * Size optimization... with inlined roundsm16, binary would be over 5 times
185 * larger and would only be 0.5% faster (on sandy-bridge).
186 */
187.align 8
188roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
189 roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
190 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
191 %rcx, (%r9));
192 ret;
193
194.align 8
195roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
196 roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
197 %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
198 %rax, (%r9));
199 ret;
200
201/*
202 * IN/OUT:
203 * x0..x7: byte-sliced AB state preloaded
204 * mem_ab: byte-sliced AB state in memory
 205 * mem_cd: byte-sliced CD state in memory
206 */
207#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
208 y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
209 leaq (key_table + (i) * 8)(CTX), %r9; \
210 call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
211 \
212 vmovdqu x4, 0 * 16(mem_cd); \
213 vmovdqu x5, 1 * 16(mem_cd); \
214 vmovdqu x6, 2 * 16(mem_cd); \
215 vmovdqu x7, 3 * 16(mem_cd); \
216 vmovdqu x0, 4 * 16(mem_cd); \
217 vmovdqu x1, 5 * 16(mem_cd); \
218 vmovdqu x2, 6 * 16(mem_cd); \
219 vmovdqu x3, 7 * 16(mem_cd); \
220 \
221 leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
222 call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
223 \
224 store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);
225
226#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */
227
228#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
229 /* Store new AB state */ \
230 vmovdqu x0, 0 * 16(mem_ab); \
231 vmovdqu x1, 1 * 16(mem_ab); \
232 vmovdqu x2, 2 * 16(mem_ab); \
233 vmovdqu x3, 3 * 16(mem_ab); \
234 vmovdqu x4, 4 * 16(mem_ab); \
235 vmovdqu x5, 5 * 16(mem_ab); \
236 vmovdqu x6, 6 * 16(mem_ab); \
237 vmovdqu x7, 7 * 16(mem_ab);
238
239#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
240 y6, y7, mem_ab, mem_cd, i) \
241 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
242 y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
243 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
244 y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
245 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
246 y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);
247
248#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
249 y6, y7, mem_ab, mem_cd, i) \
250 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
251 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
252 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
253 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
254 two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
255 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
256
257/*
258 * IN:
259 * v0..3: byte-sliced 32-bit integers
260 * OUT:
261 * v0..3: (IN <<< 1)
262 */
263#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
264 vpcmpgtb v0, zero, t0; \
265 vpaddb v0, v0, v0; \
266 vpabsb t0, t0; \
267 \
268 vpcmpgtb v1, zero, t1; \
269 vpaddb v1, v1, v1; \
270 vpabsb t1, t1; \
271 \
272 vpcmpgtb v2, zero, t2; \
273 vpaddb v2, v2, v2; \
274 vpabsb t2, t2; \
275 \
276 vpor t0, v1, v1; \
277 \
278 vpcmpgtb v3, zero, t0; \
279 vpaddb v3, v3, v3; \
280 vpabsb t0, t0; \
281 \
282 vpor t1, v2, v2; \
283 vpor t2, v3, v3; \
284 vpor t0, v0, v0;
285
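The rol32_1_16 macro above computes, for sixteen byte-sliced 32-bit words at once, the plain rotate-left-by-one shown below; vpcmpgtb/vpabsb extract each byte's top bit so it can be carried into the next byte slice. A minimal scalar sketch (hypothetical helper, not part of the patch):

#include <stdint.h>

/* Rotate a 32-bit word left by one bit. */
static inline uint32_t rol32_by_1(uint32_t v)
{
	return (v << 1) | (v >> 31);
}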
286/*
287 * IN:
288 * r: byte-sliced AB state in memory
289 * l: byte-sliced CD state in memory
290 * OUT:
291 * x0..x7: new byte-sliced CD state
292 */
293#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
294 tt1, tt2, tt3, kll, klr, krl, krr) \
295 /* \
296 * t0 = kll; \
297 * t0 &= ll; \
298 * lr ^= rol32(t0, 1); \
299 */ \
300 vpxor tt0, tt0, tt0; \
301 vmovd kll, t0; \
302 vpshufb tt0, t0, t3; \
303 vpsrldq $1, t0, t0; \
304 vpshufb tt0, t0, t2; \
305 vpsrldq $1, t0, t0; \
306 vpshufb tt0, t0, t1; \
307 vpsrldq $1, t0, t0; \
308 vpshufb tt0, t0, t0; \
309 \
310 vpand l0, t0, t0; \
311 vpand l1, t1, t1; \
312 vpand l2, t2, t2; \
313 vpand l3, t3, t3; \
314 \
315 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
316 \
317 vpxor l4, t0, l4; \
318 vmovdqu l4, 4 * 16(l); \
319 vpxor l5, t1, l5; \
320 vmovdqu l5, 5 * 16(l); \
321 vpxor l6, t2, l6; \
322 vmovdqu l6, 6 * 16(l); \
323 vpxor l7, t3, l7; \
324 vmovdqu l7, 7 * 16(l); \
325 \
326 /* \
327 * t2 = krr; \
328 * t2 |= rr; \
329 * rl ^= t2; \
330 */ \
331 \
332 vmovd krr, t0; \
333 vpshufb tt0, t0, t3; \
334 vpsrldq $1, t0, t0; \
335 vpshufb tt0, t0, t2; \
336 vpsrldq $1, t0, t0; \
337 vpshufb tt0, t0, t1; \
338 vpsrldq $1, t0, t0; \
339 vpshufb tt0, t0, t0; \
340 \
341 vpor 4 * 16(r), t0, t0; \
342 vpor 5 * 16(r), t1, t1; \
343 vpor 6 * 16(r), t2, t2; \
344 vpor 7 * 16(r), t3, t3; \
345 \
346 vpxor 0 * 16(r), t0, t0; \
347 vpxor 1 * 16(r), t1, t1; \
348 vpxor 2 * 16(r), t2, t2; \
349 vpxor 3 * 16(r), t3, t3; \
350 vmovdqu t0, 0 * 16(r); \
351 vmovdqu t1, 1 * 16(r); \
352 vmovdqu t2, 2 * 16(r); \
353 vmovdqu t3, 3 * 16(r); \
354 \
355 /* \
356 * t2 = krl; \
357 * t2 &= rl; \
358 * rr ^= rol32(t2, 1); \
359 */ \
360 vmovd krl, t0; \
361 vpshufb tt0, t0, t3; \
362 vpsrldq $1, t0, t0; \
363 vpshufb tt0, t0, t2; \
364 vpsrldq $1, t0, t0; \
365 vpshufb tt0, t0, t1; \
366 vpsrldq $1, t0, t0; \
367 vpshufb tt0, t0, t0; \
368 \
369 vpand 0 * 16(r), t0, t0; \
370 vpand 1 * 16(r), t1, t1; \
371 vpand 2 * 16(r), t2, t2; \
372 vpand 3 * 16(r), t3, t3; \
373 \
374 rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
375 \
376 vpxor 4 * 16(r), t0, t0; \
377 vpxor 5 * 16(r), t1, t1; \
378 vpxor 6 * 16(r), t2, t2; \
379 vpxor 7 * 16(r), t3, t3; \
380 vmovdqu t0, 4 * 16(r); \
381 vmovdqu t1, 5 * 16(r); \
382 vmovdqu t2, 6 * 16(r); \
383 vmovdqu t3, 7 * 16(r); \
384 \
385 /* \
386 * t0 = klr; \
387 * t0 |= lr; \
388 * ll ^= t0; \
389 */ \
390 \
391 vmovd klr, t0; \
392 vpshufb tt0, t0, t3; \
393 vpsrldq $1, t0, t0; \
394 vpshufb tt0, t0, t2; \
395 vpsrldq $1, t0, t0; \
396 vpshufb tt0, t0, t1; \
397 vpsrldq $1, t0, t0; \
398 vpshufb tt0, t0, t0; \
399 \
400 vpor l4, t0, t0; \
401 vpor l5, t1, t1; \
402 vpor l6, t2, t2; \
403 vpor l7, t3, t3; \
404 \
405 vpxor l0, t0, l0; \
406 vmovdqu l0, 0 * 16(l); \
407 vpxor l1, t1, l1; \
408 vmovdqu l1, 1 * 16(l); \
409 vpxor l2, t2, l2; \
410 vmovdqu l2, 2 * 16(l); \
411 vpxor l3, t3, l3; \
412 vmovdqu l3, 3 * 16(l);
413
414#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
415 vpunpckhdq x1, x0, t2; \
416 vpunpckldq x1, x0, x0; \
417 \
418 vpunpckldq x3, x2, t1; \
419 vpunpckhdq x3, x2, x2; \
420 \
421 vpunpckhqdq t1, x0, x1; \
422 vpunpcklqdq t1, x0, x0; \
423 \
424 vpunpckhqdq x2, t2, x3; \
425 vpunpcklqdq x2, t2, x2;
426
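For orientation, transpose_4x4 performs a 4x4 transpose of 32-bit words spread across four XMM registers (vpunpck*dq/vpunpck*qdq). A scalar sketch of the same operation, with a hypothetical helper name:

#include <stdint.h>

/* In-place 4x4 transpose of 32-bit words. */
static void transpose_4x4_scalar(uint32_t m[4][4])
{
	for (int i = 0; i < 4; i++) {
		for (int j = i + 1; j < 4; j++) {
			uint32_t t = m[i][j];
			m[i][j] = m[j][i];
			m[j][i] = t;
		}
	}
}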
427#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
428 b3, c3, d3, st0, st1) \
429 vmovdqu d2, st0; \
430 vmovdqu d3, st1; \
431 transpose_4x4(a0, a1, a2, a3, d2, d3); \
432 transpose_4x4(b0, b1, b2, b3, d2, d3); \
433 vmovdqu st0, d2; \
434 vmovdqu st1, d3; \
435 \
436 vmovdqu a0, st0; \
437 vmovdqu a1, st1; \
438 transpose_4x4(c0, c1, c2, c3, a0, a1); \
439 transpose_4x4(d0, d1, d2, d3, a0, a1); \
440 \
441 vmovdqu .Lshufb_16x16b, a0; \
442 vmovdqu st1, a1; \
443 vpshufb a0, a2, a2; \
444 vpshufb a0, a3, a3; \
445 vpshufb a0, b0, b0; \
446 vpshufb a0, b1, b1; \
447 vpshufb a0, b2, b2; \
448 vpshufb a0, b3, b3; \
449 vpshufb a0, a1, a1; \
450 vpshufb a0, c0, c0; \
451 vpshufb a0, c1, c1; \
452 vpshufb a0, c2, c2; \
453 vpshufb a0, c3, c3; \
454 vpshufb a0, d0, d0; \
455 vpshufb a0, d1, d1; \
456 vpshufb a0, d2, d2; \
457 vpshufb a0, d3, d3; \
458 vmovdqu d3, st1; \
459 vmovdqu st0, d3; \
460 vpshufb a0, d3, a0; \
461 vmovdqu d2, st0; \
462 \
463 transpose_4x4(a0, b0, c0, d0, d2, d3); \
464 transpose_4x4(a1, b1, c1, d1, d2, d3); \
465 vmovdqu st0, d2; \
466 vmovdqu st1, d3; \
467 \
468 vmovdqu b0, st0; \
469 vmovdqu b1, st1; \
470 transpose_4x4(a2, b2, c2, d2, b0, b1); \
471 transpose_4x4(a3, b3, c3, d3, b0, b1); \
472 vmovdqu st0, b0; \
473 vmovdqu st1, b1; \
474 /* does not adjust output bytes inside vectors */
475
476/* load blocks to registers and apply pre-whitening */
477#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
478 y6, y7, rio, key) \
479 vmovq key, x0; \
480 vpshufb .Lpack_bswap, x0, x0; \
481 \
482 vpxor 0 * 16(rio), x0, y7; \
483 vpxor 1 * 16(rio), x0, y6; \
484 vpxor 2 * 16(rio), x0, y5; \
485 vpxor 3 * 16(rio), x0, y4; \
486 vpxor 4 * 16(rio), x0, y3; \
487 vpxor 5 * 16(rio), x0, y2; \
488 vpxor 6 * 16(rio), x0, y1; \
489 vpxor 7 * 16(rio), x0, y0; \
490 vpxor 8 * 16(rio), x0, x7; \
491 vpxor 9 * 16(rio), x0, x6; \
492 vpxor 10 * 16(rio), x0, x5; \
493 vpxor 11 * 16(rio), x0, x4; \
494 vpxor 12 * 16(rio), x0, x3; \
495 vpxor 13 * 16(rio), x0, x2; \
496 vpxor 14 * 16(rio), x0, x1; \
497 vpxor 15 * 16(rio), x0, x0;
498
499/* byteslice pre-whitened blocks and store to temporary memory */
500#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
501 y6, y7, mem_ab, mem_cd) \
502 byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
503 y5, y6, y7, (mem_ab), (mem_cd)); \
504 \
505 vmovdqu x0, 0 * 16(mem_ab); \
506 vmovdqu x1, 1 * 16(mem_ab); \
507 vmovdqu x2, 2 * 16(mem_ab); \
508 vmovdqu x3, 3 * 16(mem_ab); \
509 vmovdqu x4, 4 * 16(mem_ab); \
510 vmovdqu x5, 5 * 16(mem_ab); \
511 vmovdqu x6, 6 * 16(mem_ab); \
512 vmovdqu x7, 7 * 16(mem_ab); \
513 vmovdqu y0, 0 * 16(mem_cd); \
514 vmovdqu y1, 1 * 16(mem_cd); \
515 vmovdqu y2, 2 * 16(mem_cd); \
516 vmovdqu y3, 3 * 16(mem_cd); \
517 vmovdqu y4, 4 * 16(mem_cd); \
518 vmovdqu y5, 5 * 16(mem_cd); \
519 vmovdqu y6, 6 * 16(mem_cd); \
520 vmovdqu y7, 7 * 16(mem_cd);
521
522/* de-byteslice, apply post-whitening and store blocks */
523#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
524 y5, y6, y7, key, stack_tmp0, stack_tmp1) \
525 byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
526 y7, x3, x7, stack_tmp0, stack_tmp1); \
527 \
528 vmovdqu x0, stack_tmp0; \
529 \
530 vmovq key, x0; \
531 vpshufb .Lpack_bswap, x0, x0; \
532 \
533 vpxor x0, y7, y7; \
534 vpxor x0, y6, y6; \
535 vpxor x0, y5, y5; \
536 vpxor x0, y4, y4; \
537 vpxor x0, y3, y3; \
538 vpxor x0, y2, y2; \
539 vpxor x0, y1, y1; \
540 vpxor x0, y0, y0; \
541 vpxor x0, x7, x7; \
542 vpxor x0, x6, x6; \
543 vpxor x0, x5, x5; \
544 vpxor x0, x4, x4; \
545 vpxor x0, x3, x3; \
546 vpxor x0, x2, x2; \
547 vpxor x0, x1, x1; \
548 vpxor stack_tmp0, x0, x0;
549
550#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
551 y6, y7, rio) \
552 vmovdqu x0, 0 * 16(rio); \
553 vmovdqu x1, 1 * 16(rio); \
554 vmovdqu x2, 2 * 16(rio); \
555 vmovdqu x3, 3 * 16(rio); \
556 vmovdqu x4, 4 * 16(rio); \
557 vmovdqu x5, 5 * 16(rio); \
558 vmovdqu x6, 6 * 16(rio); \
559 vmovdqu x7, 7 * 16(rio); \
560 vmovdqu y0, 8 * 16(rio); \
561 vmovdqu y1, 9 * 16(rio); \
562 vmovdqu y2, 10 * 16(rio); \
563 vmovdqu y3, 11 * 16(rio); \
564 vmovdqu y4, 12 * 16(rio); \
565 vmovdqu y5, 13 * 16(rio); \
566 vmovdqu y6, 14 * 16(rio); \
567 vmovdqu y7, 15 * 16(rio);
568
569.data
570.align 16
571
572#define SHUFB_BYTES(idx) \
573 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
574
575.Lshufb_16x16b:
576 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
577
578.Lpack_bswap:
579 .long 0x00010203
580 .long 0x04050607
581 .long 0x80808080
582 .long 0x80808080
583
584/* For CTR-mode IV byteswap */
585.Lbswap128_mask:
586 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
587
588/*
589 * pre-SubByte transform
590 *
591 * pre-lookup for sbox1, sbox2, sbox3:
592 * swap_bitendianness(
593 * isom_map_camellia_to_aes(
594 * camellia_f(
 595 * swap_bitendianness(in)
596 * )
597 * )
598 * )
599 *
600 * (note: '⊕ 0xc5' inside camellia_f())
601 */
602.Lpre_tf_lo_s1:
603 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
604 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
605.Lpre_tf_hi_s1:
606 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
607 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
608
609/*
610 * pre-SubByte transform
611 *
612 * pre-lookup for sbox4:
613 * swap_bitendianness(
614 * isom_map_camellia_to_aes(
615 * camellia_f(
 616 * swap_bitendianness(in <<< 1)
617 * )
618 * )
619 * )
620 *
621 * (note: '⊕ 0xc5' inside camellia_f())
622 */
623.Lpre_tf_lo_s4:
624 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
625 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
626.Lpre_tf_hi_s4:
627 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
628 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
629
630/*
631 * post-SubByte transform
632 *
633 * post-lookup for sbox1, sbox4:
634 * swap_bitendianness(
635 * camellia_h(
636 * isom_map_aes_to_camellia(
637 * swap_bitendianness(
638 * aes_inverse_affine_transform(in)
639 * )
640 * )
641 * )
642 * )
643 *
644 * (note: '⊕ 0x6e' inside camellia_h())
645 */
646.Lpost_tf_lo_s1:
647 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
648 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
649.Lpost_tf_hi_s1:
650 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
651 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
652
653/*
654 * post-SubByte transform
655 *
656 * post-lookup for sbox2:
657 * swap_bitendianness(
658 * camellia_h(
659 * isom_map_aes_to_camellia(
660 * swap_bitendianness(
661 * aes_inverse_affine_transform(in)
662 * )
663 * )
664 * )
665 * ) <<< 1
666 *
667 * (note: '⊕ 0x6e' inside camellia_h())
668 */
669.Lpost_tf_lo_s2:
670 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
671 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
672.Lpost_tf_hi_s2:
673 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
674 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
675
676/*
677 * post-SubByte transform
678 *
679 * post-lookup for sbox3:
680 * swap_bitendianness(
681 * camellia_h(
682 * isom_map_aes_to_camellia(
683 * swap_bitendianness(
684 * aes_inverse_affine_transform(in)
685 * )
686 * )
687 * )
688 * ) >>> 1
689 *
690 * (note: '⊕ 0x6e' inside camellia_h())
691 */
692.Lpost_tf_lo_s3:
693 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
694 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
695.Lpost_tf_hi_s3:
696 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
697 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
698
699/* For isolating SubBytes from AESENCLAST, inverse shift row */
700.Linv_shift_row:
701 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
702 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
703
704/* 4-bit mask */
705.align 4
706.L0f0f0f0f:
707 .long 0x0f0f0f0f
708
709.text
710
711.align 8
712.type __camellia_enc_blk16,@function;
713
714__camellia_enc_blk16:
715 /* input:
716 * %rdi: ctx, CTX
717 * %rax: temporary storage, 256 bytes
718 * %xmm0..%xmm15: 16 plaintext blocks
719 * output:
720 * %xmm0..%xmm15: 16 encrypted blocks, order swapped:
 721 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
722 */
723
724 leaq 8 * 16(%rax), %rcx;
725
726 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
727 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
728 %xmm15, %rax, %rcx);
729
730 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
731 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
732 %xmm15, %rax, %rcx, 0);
733
734 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
735 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
736 %xmm15,
737 ((key_table + (8) * 8) + 0)(CTX),
738 ((key_table + (8) * 8) + 4)(CTX),
739 ((key_table + (8) * 8) + 8)(CTX),
740 ((key_table + (8) * 8) + 12)(CTX));
741
742 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
743 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
744 %xmm15, %rax, %rcx, 8);
745
746 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
747 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
748 %xmm15,
749 ((key_table + (16) * 8) + 0)(CTX),
750 ((key_table + (16) * 8) + 4)(CTX),
751 ((key_table + (16) * 8) + 8)(CTX),
752 ((key_table + (16) * 8) + 12)(CTX));
753
754 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
755 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
756 %xmm15, %rax, %rcx, 16);
757
758 movl $24, %r8d;
759 cmpl $16, key_length(CTX);
760 jne .Lenc_max32;
761
762.Lenc_done:
763 /* load CD for output */
764 vmovdqu 0 * 16(%rcx), %xmm8;
765 vmovdqu 1 * 16(%rcx), %xmm9;
766 vmovdqu 2 * 16(%rcx), %xmm10;
767 vmovdqu 3 * 16(%rcx), %xmm11;
768 vmovdqu 4 * 16(%rcx), %xmm12;
769 vmovdqu 5 * 16(%rcx), %xmm13;
770 vmovdqu 6 * 16(%rcx), %xmm14;
771 vmovdqu 7 * 16(%rcx), %xmm15;
772
773 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
774 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
775 %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));
776
777 ret;
778
779.align 8
780.Lenc_max32:
781 movl $32, %r8d;
782
783 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
784 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
785 %xmm15,
786 ((key_table + (24) * 8) + 0)(CTX),
787 ((key_table + (24) * 8) + 4)(CTX),
788 ((key_table + (24) * 8) + 8)(CTX),
789 ((key_table + (24) * 8) + 12)(CTX));
790
791 enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
792 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
793 %xmm15, %rax, %rcx, 24);
794
795 jmp .Lenc_done;
796
797.align 8
798.type __camellia_dec_blk16,@function;
799
800__camellia_dec_blk16:
801 /* input:
802 * %rdi: ctx, CTX
803 * %rax: temporary storage, 256 bytes
804 * %r8d: 24 for 16 byte key, 32 for larger
805 * %xmm0..%xmm15: 16 encrypted blocks
806 * output:
807 * %xmm0..%xmm15: 16 plaintext blocks, order swapped:
 808 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
809 */
810
811 leaq 8 * 16(%rax), %rcx;
812
813 inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
814 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
815 %xmm15, %rax, %rcx);
816
817 cmpl $32, %r8d;
818 je .Ldec_max32;
819
820.Ldec_max24:
821 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
822 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
823 %xmm15, %rax, %rcx, 16);
824
825 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
826 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
827 %xmm15,
828 ((key_table + (16) * 8) + 8)(CTX),
829 ((key_table + (16) * 8) + 12)(CTX),
830 ((key_table + (16) * 8) + 0)(CTX),
831 ((key_table + (16) * 8) + 4)(CTX));
832
833 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
834 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
835 %xmm15, %rax, %rcx, 8);
836
837 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
838 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
839 %xmm15,
840 ((key_table + (8) * 8) + 8)(CTX),
841 ((key_table + (8) * 8) + 12)(CTX),
842 ((key_table + (8) * 8) + 0)(CTX),
843 ((key_table + (8) * 8) + 4)(CTX));
844
845 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
846 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
847 %xmm15, %rax, %rcx, 0);
848
849 /* load CD for output */
850 vmovdqu 0 * 16(%rcx), %xmm8;
851 vmovdqu 1 * 16(%rcx), %xmm9;
852 vmovdqu 2 * 16(%rcx), %xmm10;
853 vmovdqu 3 * 16(%rcx), %xmm11;
854 vmovdqu 4 * 16(%rcx), %xmm12;
855 vmovdqu 5 * 16(%rcx), %xmm13;
856 vmovdqu 6 * 16(%rcx), %xmm14;
857 vmovdqu 7 * 16(%rcx), %xmm15;
858
859 outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
860 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
861 %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));
862
863 ret;
864
865.align 8
866.Ldec_max32:
867 dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
868 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
869 %xmm15, %rax, %rcx, 24);
870
871 fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
872 %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
873 %xmm15,
874 ((key_table + (24) * 8) + 8)(CTX),
875 ((key_table + (24) * 8) + 12)(CTX),
876 ((key_table + (24) * 8) + 0)(CTX),
877 ((key_table + (24) * 8) + 4)(CTX));
878
879 jmp .Ldec_max24;
880
881.align 8
882.global camellia_ecb_enc_16way
883.type camellia_ecb_enc_16way,@function;
884
885camellia_ecb_enc_16way:
886 /* input:
887 * %rdi: ctx, CTX
888 * %rsi: dst (16 blocks)
889 * %rdx: src (16 blocks)
890 */
891
892 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
893 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
894 %xmm15, %rdx, (key_table)(CTX));
895
896 /* now dst can be used as temporary buffer (even in src == dst case) */
897 movq %rsi, %rax;
898
899 call __camellia_enc_blk16;
900
901 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
902 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
903 %xmm8, %rsi);
904
905 ret;
906
907.align 8
908.global camellia_ecb_dec_16way
909.type camellia_ecb_dec_16way,@function;
910
911camellia_ecb_dec_16way:
912 /* input:
913 * %rdi: ctx, CTX
914 * %rsi: dst (16 blocks)
915 * %rdx: src (16 blocks)
916 */
917
918 cmpl $16, key_length(CTX);
919 movl $32, %r8d;
920 movl $24, %eax;
921 cmovel %eax, %r8d; /* max */
922
923 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
924 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
925 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
926
927 /* now dst can be used as temporary buffer (even in src == dst case) */
928 movq %rsi, %rax;
929
930 call __camellia_dec_blk16;
931
932 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
933 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
934 %xmm8, %rsi);
935
936 ret;
937
938.align 8
939.global camellia_cbc_dec_16way
940.type camellia_cbc_dec_16way,@function;
941
942camellia_cbc_dec_16way:
943 /* input:
944 * %rdi: ctx, CTX
945 * %rsi: dst (16 blocks)
946 * %rdx: src (16 blocks)
947 */
948
949 cmpl $16, key_length(CTX);
950 movl $32, %r8d;
951 movl $24, %eax;
952 cmovel %eax, %r8d; /* max */
953
954 inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
955 %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
956 %xmm15, %rdx, (key_table)(CTX, %r8, 8));
957
958 /*
959 * dst might still be in-use (in case dst == src), so use stack for
960 * temporary storage.
961 */
962 subq $(16 * 16), %rsp;
963 movq %rsp, %rax;
964
965 call __camellia_dec_blk16;
966
967 addq $(16 * 16), %rsp;
968
969 vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
970 vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
971 vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
972 vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
973 vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
974 vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
975 vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
976 vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
977 vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
978 vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
979 vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
980 vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
981 vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
982 vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
983 vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
984 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
985 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
986 %xmm8, %rsi);
987
988 ret;
989
990#define inc_le128(x, minus_one, tmp) \
991 vpcmpeqq minus_one, x, tmp; \
992 vpsubq minus_one, x, x; \
993 vpslldq $8, tmp, tmp; \
994 vpsubq tmp, x, x;
995
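The inc_le128 macro above increments a 128-bit little-endian counter held in an XMM register: vpcmpeqq against all-ones detects that the low 64-bit half is about to wrap, and the shifted mask then propagates the carry into the high half. A minimal scalar sketch (hypothetical helper, not part of the patch):

#include <stdint.h>

/* Increment a 128-bit little-endian counter stored as two 64-bit halves. */
static inline void inc_le128_scalar(uint64_t ctr[2])
{
	if (++ctr[0] == 0)	/* low half wrapped around */
		ctr[1]++;	/* propagate carry into the high half */
}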
996.align 8
997.global camellia_ctr_16way
998.type camellia_ctr_16way,@function;
999
1000camellia_ctr_16way:
1001 /* input:
1002 * %rdi: ctx, CTX
1003 * %rsi: dst (16 blocks)
1004 * %rdx: src (16 blocks)
1005 * %rcx: iv (little endian, 128bit)
1006 */
1007
1008 subq $(16 * 16), %rsp;
1009 movq %rsp, %rax;
1010
1011 vmovdqa .Lbswap128_mask, %xmm14;
1012
1013 /* load IV and byteswap */
1014 vmovdqu (%rcx), %xmm0;
1015 vpshufb %xmm14, %xmm0, %xmm15;
1016 vmovdqu %xmm15, 15 * 16(%rax);
1017
1018 vpcmpeqd %xmm15, %xmm15, %xmm15;
1019 vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */
1020
1021 /* construct IVs */
1022 inc_le128(%xmm0, %xmm15, %xmm13);
1023 vpshufb %xmm14, %xmm0, %xmm13;
1024 vmovdqu %xmm13, 14 * 16(%rax);
1025 inc_le128(%xmm0, %xmm15, %xmm13);
1026 vpshufb %xmm14, %xmm0, %xmm13;
1027 vmovdqu %xmm13, 13 * 16(%rax);
1028 inc_le128(%xmm0, %xmm15, %xmm13);
1029 vpshufb %xmm14, %xmm0, %xmm12;
1030 inc_le128(%xmm0, %xmm15, %xmm13);
1031 vpshufb %xmm14, %xmm0, %xmm11;
1032 inc_le128(%xmm0, %xmm15, %xmm13);
1033 vpshufb %xmm14, %xmm0, %xmm10;
1034 inc_le128(%xmm0, %xmm15, %xmm13);
1035 vpshufb %xmm14, %xmm0, %xmm9;
1036 inc_le128(%xmm0, %xmm15, %xmm13);
1037 vpshufb %xmm14, %xmm0, %xmm8;
1038 inc_le128(%xmm0, %xmm15, %xmm13);
1039 vpshufb %xmm14, %xmm0, %xmm7;
1040 inc_le128(%xmm0, %xmm15, %xmm13);
1041 vpshufb %xmm14, %xmm0, %xmm6;
1042 inc_le128(%xmm0, %xmm15, %xmm13);
1043 vpshufb %xmm14, %xmm0, %xmm5;
1044 inc_le128(%xmm0, %xmm15, %xmm13);
1045 vpshufb %xmm14, %xmm0, %xmm4;
1046 inc_le128(%xmm0, %xmm15, %xmm13);
1047 vpshufb %xmm14, %xmm0, %xmm3;
1048 inc_le128(%xmm0, %xmm15, %xmm13);
1049 vpshufb %xmm14, %xmm0, %xmm2;
1050 inc_le128(%xmm0, %xmm15, %xmm13);
1051 vpshufb %xmm14, %xmm0, %xmm1;
1052 inc_le128(%xmm0, %xmm15, %xmm13);
1053 vmovdqa %xmm0, %xmm13;
1054 vpshufb %xmm14, %xmm0, %xmm0;
1055 inc_le128(%xmm13, %xmm15, %xmm14);
1056 vmovdqu %xmm13, (%rcx);
1057
1058 /* inpack16_pre: */
1059 vmovq (key_table)(CTX), %xmm15;
1060 vpshufb .Lpack_bswap, %xmm15, %xmm15;
1061 vpxor %xmm0, %xmm15, %xmm0;
1062 vpxor %xmm1, %xmm15, %xmm1;
1063 vpxor %xmm2, %xmm15, %xmm2;
1064 vpxor %xmm3, %xmm15, %xmm3;
1065 vpxor %xmm4, %xmm15, %xmm4;
1066 vpxor %xmm5, %xmm15, %xmm5;
1067 vpxor %xmm6, %xmm15, %xmm6;
1068 vpxor %xmm7, %xmm15, %xmm7;
1069 vpxor %xmm8, %xmm15, %xmm8;
1070 vpxor %xmm9, %xmm15, %xmm9;
1071 vpxor %xmm10, %xmm15, %xmm10;
1072 vpxor %xmm11, %xmm15, %xmm11;
1073 vpxor %xmm12, %xmm15, %xmm12;
1074 vpxor 13 * 16(%rax), %xmm15, %xmm13;
1075 vpxor 14 * 16(%rax), %xmm15, %xmm14;
1076 vpxor 15 * 16(%rax), %xmm15, %xmm15;
1077
1078 call __camellia_enc_blk16;
1079
1080 addq $(16 * 16), %rsp;
1081
1082 vpxor 0 * 16(%rdx), %xmm7, %xmm7;
1083 vpxor 1 * 16(%rdx), %xmm6, %xmm6;
1084 vpxor 2 * 16(%rdx), %xmm5, %xmm5;
1085 vpxor 3 * 16(%rdx), %xmm4, %xmm4;
1086 vpxor 4 * 16(%rdx), %xmm3, %xmm3;
1087 vpxor 5 * 16(%rdx), %xmm2, %xmm2;
1088 vpxor 6 * 16(%rdx), %xmm1, %xmm1;
1089 vpxor 7 * 16(%rdx), %xmm0, %xmm0;
1090 vpxor 8 * 16(%rdx), %xmm15, %xmm15;
1091 vpxor 9 * 16(%rdx), %xmm14, %xmm14;
1092 vpxor 10 * 16(%rdx), %xmm13, %xmm13;
1093 vpxor 11 * 16(%rdx), %xmm12, %xmm12;
1094 vpxor 12 * 16(%rdx), %xmm11, %xmm11;
1095 vpxor 13 * 16(%rdx), %xmm10, %xmm10;
1096 vpxor 14 * 16(%rdx), %xmm9, %xmm9;
1097 vpxor 15 * 16(%rdx), %xmm8, %xmm8;
1098 write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
1099 %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
1100 %xmm8, %rsi);
1101
1102 ret;
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 000000000000..96cbb6068fce
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,558 @@
1/*
2 * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
3 *
4 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 */
12
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/crypto.h>
16#include <linux/err.h>
17#include <crypto/algapi.h>
18#include <crypto/ctr.h>
19#include <crypto/lrw.h>
20#include <crypto/xts.h>
21#include <asm/xcr.h>
22#include <asm/xsave.h>
23#include <asm/crypto/camellia.h>
24#include <asm/crypto/ablk_helper.h>
25#include <asm/crypto/glue_helper.h>
26
27#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16
28
29/* 16-way AES-NI parallel cipher functions */
30asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
31 const u8 *src);
32asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
33 const u8 *src);
34
35asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
36 const u8 *src);
37asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
38 const u8 *src, le128 *iv);
39
40static const struct common_glue_ctx camellia_enc = {
41 .num_funcs = 3,
42 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
43
44 .funcs = { {
45 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
46 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
47 }, {
48 .num_blocks = 2,
49 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
50 }, {
51 .num_blocks = 1,
52 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
53 } }
54};
55
56static const struct common_glue_ctx camellia_ctr = {
57 .num_funcs = 3,
58 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
59
60 .funcs = { {
61 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
62 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
63 }, {
64 .num_blocks = 2,
65 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
66 }, {
67 .num_blocks = 1,
68 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
69 } }
70};
71
72static const struct common_glue_ctx camellia_dec = {
73 .num_funcs = 3,
74 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
75
76 .funcs = { {
77 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
78 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
79 }, {
80 .num_blocks = 2,
81 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
82 }, {
83 .num_blocks = 1,
84 .fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
85 } }
86};
87
88static const struct common_glue_ctx camellia_dec_cbc = {
89 .num_funcs = 3,
90 .fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,
91
92 .funcs = { {
93 .num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
94 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
95 }, {
96 .num_blocks = 2,
97 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
98 }, {
99 .num_blocks = 1,
100 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
101 } }
102};
103
104static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
105 struct scatterlist *src, unsigned int nbytes)
106{
107 return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
108}
109
110static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
111 struct scatterlist *src, unsigned int nbytes)
112{
113 return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
114}
115
116static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
117 struct scatterlist *src, unsigned int nbytes)
118{
119 return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
120 dst, src, nbytes);
121}
122
123static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
124 struct scatterlist *src, unsigned int nbytes)
125{
126 return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
127 nbytes);
128}
129
130static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
131 struct scatterlist *src, unsigned int nbytes)
132{
133 return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
134}
135
136static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
137{
138 return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
139 CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
140 nbytes);
141}
142
143static inline void camellia_fpu_end(bool fpu_enabled)
144{
145 glue_fpu_end(fpu_enabled);
146}
147
148static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
149 unsigned int key_len)
150{
151 return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
152 &tfm->crt_flags);
153}
154
155struct crypt_priv {
156 struct camellia_ctx *ctx;
157 bool fpu_enabled;
158};
159
160static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
161{
162 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
163 struct crypt_priv *ctx = priv;
164 int i;
165
166 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
167
168 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
169 camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
170 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
171 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
172 }
173
174 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
175 camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
176 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
177 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
178 }
179
180 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
181 camellia_enc_blk(ctx->ctx, srcdst, srcdst);
182}
183
184static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
185{
186 const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
187 struct crypt_priv *ctx = priv;
188 int i;
189
190 ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);
191
192 if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
193 camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
194 srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
195 nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
196 }
197
198 while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
199 camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
200 srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
201 nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
202 }
203
204 for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
205 camellia_dec_blk(ctx->ctx, srcdst, srcdst);
206}
207
208static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
209 struct scatterlist *src, unsigned int nbytes)
210{
211 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
212 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
213 struct crypt_priv crypt_ctx = {
214 .ctx = &ctx->camellia_ctx,
215 .fpu_enabled = false,
216 };
217 struct lrw_crypt_req req = {
218 .tbuf = buf,
219 .tbuflen = sizeof(buf),
220
221 .table_ctx = &ctx->lrw_table,
222 .crypt_ctx = &crypt_ctx,
223 .crypt_fn = encrypt_callback,
224 };
225 int ret;
226
227 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
228 ret = lrw_crypt(desc, dst, src, nbytes, &req);
229 camellia_fpu_end(crypt_ctx.fpu_enabled);
230
231 return ret;
232}
233
234static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
235 struct scatterlist *src, unsigned int nbytes)
236{
237 struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
238 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
239 struct crypt_priv crypt_ctx = {
240 .ctx = &ctx->camellia_ctx,
241 .fpu_enabled = false,
242 };
243 struct lrw_crypt_req req = {
244 .tbuf = buf,
245 .tbuflen = sizeof(buf),
246
247 .table_ctx = &ctx->lrw_table,
248 .crypt_ctx = &crypt_ctx,
249 .crypt_fn = decrypt_callback,
250 };
251 int ret;
252
253 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
254 ret = lrw_crypt(desc, dst, src, nbytes, &req);
255 camellia_fpu_end(crypt_ctx.fpu_enabled);
256
257 return ret;
258}
259
260static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
261 struct scatterlist *src, unsigned int nbytes)
262{
263 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
264 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
265 struct crypt_priv crypt_ctx = {
266 .ctx = &ctx->crypt_ctx,
267 .fpu_enabled = false,
268 };
269 struct xts_crypt_req req = {
270 .tbuf = buf,
271 .tbuflen = sizeof(buf),
272
273 .tweak_ctx = &ctx->tweak_ctx,
274 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
275 .crypt_ctx = &crypt_ctx,
276 .crypt_fn = encrypt_callback,
277 };
278 int ret;
279
280 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
281 ret = xts_crypt(desc, dst, src, nbytes, &req);
282 camellia_fpu_end(crypt_ctx.fpu_enabled);
283
284 return ret;
285}
286
287static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
288 struct scatterlist *src, unsigned int nbytes)
289{
290 struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
291 be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
292 struct crypt_priv crypt_ctx = {
293 .ctx = &ctx->crypt_ctx,
294 .fpu_enabled = false,
295 };
296 struct xts_crypt_req req = {
297 .tbuf = buf,
298 .tbuflen = sizeof(buf),
299
300 .tweak_ctx = &ctx->tweak_ctx,
301 .tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
302 .crypt_ctx = &crypt_ctx,
303 .crypt_fn = decrypt_callback,
304 };
305 int ret;
306
307 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
308 ret = xts_crypt(desc, dst, src, nbytes, &req);
309 camellia_fpu_end(crypt_ctx.fpu_enabled);
310
311 return ret;
312}
313
314static struct crypto_alg cmll_algs[10] = { {
315 .cra_name = "__ecb-camellia-aesni",
316 .cra_driver_name = "__driver-ecb-camellia-aesni",
317 .cra_priority = 0,
318 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
319 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
320 .cra_ctxsize = sizeof(struct camellia_ctx),
321 .cra_alignmask = 0,
322 .cra_type = &crypto_blkcipher_type,
323 .cra_module = THIS_MODULE,
324 .cra_u = {
325 .blkcipher = {
326 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
327 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
328 .setkey = camellia_setkey,
329 .encrypt = ecb_encrypt,
330 .decrypt = ecb_decrypt,
331 },
332 },
333}, {
334 .cra_name = "__cbc-camellia-aesni",
335 .cra_driver_name = "__driver-cbc-camellia-aesni",
336 .cra_priority = 0,
337 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
338 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
339 .cra_ctxsize = sizeof(struct camellia_ctx),
340 .cra_alignmask = 0,
341 .cra_type = &crypto_blkcipher_type,
342 .cra_module = THIS_MODULE,
343 .cra_u = {
344 .blkcipher = {
345 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
346 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
347 .setkey = camellia_setkey,
348 .encrypt = cbc_encrypt,
349 .decrypt = cbc_decrypt,
350 },
351 },
352}, {
353 .cra_name = "__ctr-camellia-aesni",
354 .cra_driver_name = "__driver-ctr-camellia-aesni",
355 .cra_priority = 0,
356 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
357 .cra_blocksize = 1,
358 .cra_ctxsize = sizeof(struct camellia_ctx),
359 .cra_alignmask = 0,
360 .cra_type = &crypto_blkcipher_type,
361 .cra_module = THIS_MODULE,
362 .cra_u = {
363 .blkcipher = {
364 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
365 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
366 .ivsize = CAMELLIA_BLOCK_SIZE,
367 .setkey = camellia_setkey,
368 .encrypt = ctr_crypt,
369 .decrypt = ctr_crypt,
370 },
371 },
372}, {
373 .cra_name = "__lrw-camellia-aesni",
374 .cra_driver_name = "__driver-lrw-camellia-aesni",
375 .cra_priority = 0,
376 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
377 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
378 .cra_ctxsize = sizeof(struct camellia_lrw_ctx),
379 .cra_alignmask = 0,
380 .cra_type = &crypto_blkcipher_type,
381 .cra_module = THIS_MODULE,
382 .cra_exit = lrw_camellia_exit_tfm,
383 .cra_u = {
384 .blkcipher = {
385 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
386 CAMELLIA_BLOCK_SIZE,
387 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
388 CAMELLIA_BLOCK_SIZE,
389 .ivsize = CAMELLIA_BLOCK_SIZE,
390 .setkey = lrw_camellia_setkey,
391 .encrypt = lrw_encrypt,
392 .decrypt = lrw_decrypt,
393 },
394 },
395}, {
396 .cra_name = "__xts-camellia-aesni",
397 .cra_driver_name = "__driver-xts-camellia-aesni",
398 .cra_priority = 0,
399 .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
400 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
401 .cra_ctxsize = sizeof(struct camellia_xts_ctx),
402 .cra_alignmask = 0,
403 .cra_type = &crypto_blkcipher_type,
404 .cra_module = THIS_MODULE,
405 .cra_u = {
406 .blkcipher = {
407 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
408 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
409 .ivsize = CAMELLIA_BLOCK_SIZE,
410 .setkey = xts_camellia_setkey,
411 .encrypt = xts_encrypt,
412 .decrypt = xts_decrypt,
413 },
414 },
415}, {
416 .cra_name = "ecb(camellia)",
417 .cra_driver_name = "ecb-camellia-aesni",
418 .cra_priority = 400,
419 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
420 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
421 .cra_ctxsize = sizeof(struct async_helper_ctx),
422 .cra_alignmask = 0,
423 .cra_type = &crypto_ablkcipher_type,
424 .cra_module = THIS_MODULE,
425 .cra_init = ablk_init,
426 .cra_exit = ablk_exit,
427 .cra_u = {
428 .ablkcipher = {
429 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
430 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
431 .setkey = ablk_set_key,
432 .encrypt = ablk_encrypt,
433 .decrypt = ablk_decrypt,
434 },
435 },
436}, {
437 .cra_name = "cbc(camellia)",
438 .cra_driver_name = "cbc-camellia-aesni",
439 .cra_priority = 400,
440 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
441 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
442 .cra_ctxsize = sizeof(struct async_helper_ctx),
443 .cra_alignmask = 0,
444 .cra_type = &crypto_ablkcipher_type,
445 .cra_module = THIS_MODULE,
446 .cra_init = ablk_init,
447 .cra_exit = ablk_exit,
448 .cra_u = {
449 .ablkcipher = {
450 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
451 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
452 .ivsize = CAMELLIA_BLOCK_SIZE,
453 .setkey = ablk_set_key,
454 .encrypt = __ablk_encrypt,
455 .decrypt = ablk_decrypt,
456 },
457 },
458}, {
459 .cra_name = "ctr(camellia)",
460 .cra_driver_name = "ctr-camellia-aesni",
461 .cra_priority = 400,
462 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
463 .cra_blocksize = 1,
464 .cra_ctxsize = sizeof(struct async_helper_ctx),
465 .cra_alignmask = 0,
466 .cra_type = &crypto_ablkcipher_type,
467 .cra_module = THIS_MODULE,
468 .cra_init = ablk_init,
469 .cra_exit = ablk_exit,
470 .cra_u = {
471 .ablkcipher = {
472 .min_keysize = CAMELLIA_MIN_KEY_SIZE,
473 .max_keysize = CAMELLIA_MAX_KEY_SIZE,
474 .ivsize = CAMELLIA_BLOCK_SIZE,
475 .setkey = ablk_set_key,
476 .encrypt = ablk_encrypt,
477 .decrypt = ablk_encrypt,
478 .geniv = "chainiv",
479 },
480 },
481}, {
482 .cra_name = "lrw(camellia)",
483 .cra_driver_name = "lrw-camellia-aesni",
484 .cra_priority = 400,
485 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
486 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
487 .cra_ctxsize = sizeof(struct async_helper_ctx),
488 .cra_alignmask = 0,
489 .cra_type = &crypto_ablkcipher_type,
490 .cra_module = THIS_MODULE,
491 .cra_init = ablk_init,
492 .cra_exit = ablk_exit,
493 .cra_u = {
494 .ablkcipher = {
495 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
496 CAMELLIA_BLOCK_SIZE,
497 .max_keysize = CAMELLIA_MAX_KEY_SIZE +
498 CAMELLIA_BLOCK_SIZE,
499 .ivsize = CAMELLIA_BLOCK_SIZE,
500 .setkey = ablk_set_key,
501 .encrypt = ablk_encrypt,
502 .decrypt = ablk_decrypt,
503 },
504 },
505}, {
506 .cra_name = "xts(camellia)",
507 .cra_driver_name = "xts-camellia-aesni",
508 .cra_priority = 400,
509 .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
510 .cra_blocksize = CAMELLIA_BLOCK_SIZE,
511 .cra_ctxsize = sizeof(struct async_helper_ctx),
512 .cra_alignmask = 0,
513 .cra_type = &crypto_ablkcipher_type,
514 .cra_module = THIS_MODULE,
515 .cra_init = ablk_init,
516 .cra_exit = ablk_exit,
517 .cra_u = {
518 .ablkcipher = {
519 .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2,
520 .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2,
521 .ivsize = CAMELLIA_BLOCK_SIZE,
522 .setkey = ablk_set_key,
523 .encrypt = ablk_encrypt,
524 .decrypt = ablk_decrypt,
525 },
526 },
527} };
528
529static int __init camellia_aesni_init(void)
530{
531 u64 xcr0;
532
533 if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) {
534 pr_info("AVX or AES-NI instructions are not detected.\n");
535 return -ENODEV;
536 }
537
538 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
539 if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) {
540 pr_info("AVX detected but unusable.\n");
541 return -ENODEV;
542 }
543
544 return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
545}
546
547static void __exit camellia_aesni_fini(void)
548{
549 crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs));
550}
551
552module_init(camellia_aesni_init);
553module_exit(camellia_aesni_fini);
554
555MODULE_LICENSE("GPL");
556MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized");
557MODULE_ALIAS("camellia");
558MODULE_ALIAS("camellia-asm");
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 42ffd2bbab5b..5cb86ccd4acb 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -32,53 +32,24 @@
 #include <crypto/algapi.h>
 #include <crypto/lrw.h>
 #include <crypto/xts.h>
+#include <asm/crypto/camellia.h>
 #include <asm/crypto/glue_helper.h>
 
-#define CAMELLIA_MIN_KEY_SIZE	16
-#define CAMELLIA_MAX_KEY_SIZE	32
-#define CAMELLIA_BLOCK_SIZE	16
-#define CAMELLIA_TABLE_BYTE_LEN	272
-
-struct camellia_ctx {
-	u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
-	u32 key_length;
-};
-
 /* regular block cipher functions */
 asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
 				   const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk);
 asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
 				 const u8 *src);
+EXPORT_SYMBOL_GPL(camellia_dec_blk);
 
 /* 2-way parallel cipher functions */
 asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 					const u8 *src, bool xor);
+EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way);
 asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
 				      const u8 *src);
-
-static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
-				    const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
-					 const u8 *src)
-{
-	__camellia_enc_blk(ctx, dst, src, true);
-}
-
-static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
-					  const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, false);
-}
-
-static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
-					      const u8 *src)
-{
-	__camellia_enc_blk_2way(ctx, dst, src, true);
-}
+EXPORT_SYMBOL_GPL(camellia_dec_blk_2way);
 
 static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 {
@@ -1275,9 +1246,8 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey)
1275 camellia_setup256(kk, subkey); 1246 camellia_setup256(kk, subkey);
1276} 1247}
1277 1248
1278static int __camellia_setkey(struct camellia_ctx *cctx, 1249int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key,
1279 const unsigned char *key, 1250 unsigned int key_len, u32 *flags)
1280 unsigned int key_len, u32 *flags)
1281{ 1251{
1282 if (key_len != 16 && key_len != 24 && key_len != 32) { 1252 if (key_len != 16 && key_len != 24 && key_len != 32) {
1283 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; 1253 *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx,
1300 1270
1301 return 0; 1271 return 0;
1302} 1272}
1273EXPORT_SYMBOL_GPL(__camellia_setkey);
1303 1274
1304static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, 1275static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1305 unsigned int key_len) 1276 unsigned int key_len)
@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
1308 &tfm->crt_flags); 1279 &tfm->crt_flags);
1309} 1280}
1310 1281
1311static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) 1282void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1312{ 1283{
1313 u128 iv = *src; 1284 u128 iv = *src;
1314 1285
@@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src)
1316 1287
1317 u128_xor(&dst[1], &dst[1], &iv); 1288 u128_xor(&dst[1], &dst[1], &iv);
1318} 1289}
1290EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way);
1319 1291
1320static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 1292void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
1321{ 1293{
1322 be128 ctrblk; 1294 be128 ctrblk;
1323 1295
1324 if (dst != src) 1296 if (dst != src)
1325 *dst = *src; 1297 *dst = *src;
1326 1298
1327 u128_to_be128(&ctrblk, iv); 1299 le128_to_be128(&ctrblk, iv);
1328 u128_inc(iv); 1300 le128_inc(iv);
1329 1301
1330 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); 1302 camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk);
1331} 1303}
1304EXPORT_SYMBOL_GPL(camellia_crypt_ctr);
1332 1305
1333static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, 1306void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv)
1334 u128 *iv)
1335{ 1307{
1336 be128 ctrblks[2]; 1308 be128 ctrblks[2];
1337 1309
@@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
1340 dst[1] = src[1]; 1312 dst[1] = src[1];
1341 } 1313 }
1342 1314
1343 u128_to_be128(&ctrblks[0], iv); 1315 le128_to_be128(&ctrblks[0], iv);
1344 u128_inc(iv); 1316 le128_inc(iv);
1345 u128_to_be128(&ctrblks[1], iv); 1317 le128_to_be128(&ctrblks[1], iv);
1346 u128_inc(iv); 1318 le128_inc(iv);
1347 1319
1348 camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); 1320 camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
1349} 1321}
1322EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way);
1350 1323
1351static const struct common_glue_ctx camellia_enc = { 1324static const struct common_glue_ctx camellia_enc = {
1352 .num_funcs = 2, 1325 .num_funcs = 2,
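(The hunk above moves the Camellia CTR helpers from the old u128 counter type to le128, matching the glue_helper changes elsewhere in this pull. As a rough standalone sketch of the counter semantics those helpers rely on, increment a 128-bit little-endian counter and then byte-swap it into the big-endian block fed to the cipher; the names and layout below are illustrative only, not the kernel's le128/le128_inc definitions:)

#include <stdint.h>

/* Illustrative 128-bit counter held as two 64-bit halves; "lo" carries
 * into "hi" on overflow, mirroring what le128_inc() is expected to do. */
struct ctr128 { uint64_t lo, hi; };

static void ctr128_inc(struct ctr128 *c)
{
	if (++c->lo == 0)		/* carry out of the low qword */
		c->hi++;
}

/* Byte-swap into the big-endian counter block handed to the cipher,
 * the role le128_to_be128() plays in the glue code above. */
static void ctr128_to_be_block(uint8_t out[16], const struct ctr128 *c)
{
	for (int i = 0; i < 8; i++) {
		out[i]     = (uint8_t)(c->hi >> (56 - 8 * i));
		out[8 + i] = (uint8_t)(c->lo >> (56 - 8 * i));
	}
}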
@@ -1464,13 +1437,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
1464 camellia_dec_blk(ctx, srcdst, srcdst); 1437 camellia_dec_blk(ctx, srcdst, srcdst);
1465} 1438}
1466 1439
1467struct camellia_lrw_ctx { 1440int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1468 struct lrw_table_ctx lrw_table; 1441 unsigned int keylen)
1469 struct camellia_ctx camellia_ctx;
1470};
1471
1472static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1473 unsigned int keylen)
1474{ 1442{
1475 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 1443 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
1476 int err; 1444 int err;
@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1484 return lrw_init_table(&ctx->lrw_table, 1452 return lrw_init_table(&ctx->lrw_table,
1485 key + keylen - CAMELLIA_BLOCK_SIZE); 1453 key + keylen - CAMELLIA_BLOCK_SIZE);
1486} 1454}
1455EXPORT_SYMBOL_GPL(lrw_camellia_setkey);
1487 1456
1488static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1457static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1489 struct scatterlist *src, unsigned int nbytes) 1458 struct scatterlist *src, unsigned int nbytes)
@@ -1519,20 +1488,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1519 return lrw_crypt(desc, dst, src, nbytes, &req); 1488 return lrw_crypt(desc, dst, src, nbytes, &req);
1520} 1489}
1521 1490
1522static void lrw_exit_tfm(struct crypto_tfm *tfm) 1491void lrw_camellia_exit_tfm(struct crypto_tfm *tfm)
1523{ 1492{
1524 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); 1493 struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm);
1525 1494
1526 lrw_free_table(&ctx->lrw_table); 1495 lrw_free_table(&ctx->lrw_table);
1527} 1496}
1497EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm);
1528 1498
1529struct camellia_xts_ctx { 1499int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1530 struct camellia_ctx tweak_ctx; 1500 unsigned int keylen)
1531 struct camellia_ctx crypt_ctx;
1532};
1533
1534static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1535 unsigned int keylen)
1536{ 1501{
1537 struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); 1502 struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
1538 u32 *flags = &tfm->crt_flags; 1503 u32 *flags = &tfm->crt_flags;
@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
1555 return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, 1520 return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2,
1556 flags); 1521 flags);
1557} 1522}
1523EXPORT_SYMBOL_GPL(xts_camellia_setkey);
1558 1524
1559static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, 1525static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
1560 struct scatterlist *src, unsigned int nbytes) 1526 struct scatterlist *src, unsigned int nbytes)
@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { {
1679 .cra_alignmask = 0, 1645 .cra_alignmask = 0,
1680 .cra_type = &crypto_blkcipher_type, 1646 .cra_type = &crypto_blkcipher_type,
1681 .cra_module = THIS_MODULE, 1647 .cra_module = THIS_MODULE,
1682 .cra_exit = lrw_exit_tfm, 1648 .cra_exit = lrw_camellia_exit_tfm,
1683 .cra_u = { 1649 .cra_u = {
1684 .blkcipher = { 1650 .blkcipher = {
1685 .min_keysize = CAMELLIA_MIN_KEY_SIZE + 1651 .min_keysize = CAMELLIA_MIN_KEY_SIZE +
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index a41a3aaba220..15b00ac7cbd3 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -25,10 +25,10 @@
25 25
26.file "cast5-avx-x86_64-asm_64.S" 26.file "cast5-avx-x86_64-asm_64.S"
27 27
28.extern cast5_s1 28.extern cast_s1
29.extern cast5_s2 29.extern cast_s2
30.extern cast5_s3 30.extern cast_s3
31.extern cast5_s4 31.extern cast_s4
32 32
33/* structure of crypto context */ 33/* structure of crypto context */
34#define km 0 34#define km 0
@@ -36,10 +36,10 @@
36#define rr ((16*4)+16) 36#define rr ((16*4)+16)
37 37
38/* s-boxes */ 38/* s-boxes */
39#define s1 cast5_s1 39#define s1 cast_s1
40#define s2 cast5_s2 40#define s2 cast_s2
41#define s3 cast5_s3 41#define s3 cast_s3
42#define s4 cast5_s4 42#define s4 cast_s4
43 43
44/********************************************************************** 44/**********************************************************************
45 16-way AVX cast5 45 16-way AVX cast5
@@ -180,31 +180,17 @@
180 vpunpcklqdq t1, t0, x0; \ 180 vpunpcklqdq t1, t0, x0; \
181 vpunpckhqdq t1, t0, x1; 181 vpunpckhqdq t1, t0, x1;
182 182
183#define inpack_blocks(in, x0, x1, t0, t1, rmask) \ 183#define inpack_blocks(x0, x1, t0, t1, rmask) \
184 vmovdqu (0*4*4)(in), x0; \
185 vmovdqu (1*4*4)(in), x1; \
186 vpshufb rmask, x0, x0; \ 184 vpshufb rmask, x0, x0; \
187 vpshufb rmask, x1, x1; \ 185 vpshufb rmask, x1, x1; \
188 \ 186 \
189 transpose_2x4(x0, x1, t0, t1) 187 transpose_2x4(x0, x1, t0, t1)
190 188
191#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \ 189#define outunpack_blocks(x0, x1, t0, t1, rmask) \
192 transpose_2x4(x0, x1, t0, t1) \ 190 transpose_2x4(x0, x1, t0, t1) \
193 \ 191 \
194 vpshufb rmask, x0, x0; \ 192 vpshufb rmask, x0, x0; \
195 vpshufb rmask, x1, x1; \ 193 vpshufb rmask, x1, x1;
196 vmovdqu x0, (0*4*4)(out); \
197 vmovdqu x1, (1*4*4)(out);
198
199#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
200 transpose_2x4(x0, x1, t0, t1) \
201 \
202 vpshufb rmask, x0, x0; \
203 vpshufb rmask, x1, x1; \
204 vpxor (0*4*4)(out), x0, x0; \
205 vmovdqu x0, (0*4*4)(out); \
206 vpxor (1*4*4)(out), x1, x1; \
207 vmovdqu x1, (1*4*4)(out);
208 194
209.data 195.data
210 196
@@ -213,6 +199,8 @@
213 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 199 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
214.Lbswap128_mask: 200.Lbswap128_mask:
215 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 201 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
202.Lbswap_iv_mask:
203 .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0
216.L16_mask: 204.L16_mask:
217 .byte 16, 16, 16, 16 205 .byte 16, 16, 16, 16
218.L32_mask: 206.L32_mask:
@@ -223,35 +211,42 @@
223.text 211.text
224 212
225.align 16 213.align 16
226.global __cast5_enc_blk_16way 214.type __cast5_enc_blk16,@function;
227.type __cast5_enc_blk_16way,@function;
228 215
229__cast5_enc_blk_16way: 216__cast5_enc_blk16:
230 /* input: 217 /* input:
231 * %rdi: ctx, CTX 218 * %rdi: ctx, CTX
232 * %rsi: dst 219 * RL1: blocks 1 and 2
233 * %rdx: src 220 * RR1: blocks 3 and 4
234 * %rcx: bool, if true: xor output 221 * RL2: blocks 5 and 6
222 * RR2: blocks 7 and 8
223 * RL3: blocks 9 and 10
224 * RR3: blocks 11 and 12
225 * RL4: blocks 13 and 14
226 * RR4: blocks 15 and 16
227 * output:
228 * RL1: encrypted blocks 1 and 2
229 * RR1: encrypted blocks 3 and 4
230 * RL2: encrypted blocks 5 and 6
231 * RR2: encrypted blocks 7 and 8
232 * RL3: encrypted blocks 9 and 10
233 * RR3: encrypted blocks 11 and 12
234 * RL4: encrypted blocks 13 and 14
235 * RR4: encrypted blocks 15 and 16
235 */ 236 */
236 237
237 pushq %rbp; 238 pushq %rbp;
238 pushq %rbx; 239 pushq %rbx;
239 pushq %rcx;
240 240
241 vmovdqa .Lbswap_mask, RKM; 241 vmovdqa .Lbswap_mask, RKM;
242 vmovd .Lfirst_mask, R1ST; 242 vmovd .Lfirst_mask, R1ST;
243 vmovd .L32_mask, R32; 243 vmovd .L32_mask, R32;
244 enc_preload_rkr(); 244 enc_preload_rkr();
245 245
246 leaq 1*(2*4*4)(%rdx), %rax; 246 inpack_blocks(RL1, RR1, RTMP, RX, RKM);
247 inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); 247 inpack_blocks(RL2, RR2, RTMP, RX, RKM);
248 inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); 248 inpack_blocks(RL3, RR3, RTMP, RX, RKM);
249 leaq 2*(2*4*4)(%rdx), %rax; 249 inpack_blocks(RL4, RR4, RTMP, RX, RKM);
250 inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
251 leaq 3*(2*4*4)(%rdx), %rax;
252 inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
253
254 movq %rsi, %r11;
255 250
256 round(RL, RR, 0, 1); 251 round(RL, RR, 0, 1);
257 round(RR, RL, 1, 2); 252 round(RR, RL, 1, 2);
@@ -276,44 +271,41 @@ __cast5_enc_blk_16way:
276 round(RR, RL, 15, 1); 271 round(RR, RL, 15, 1);
277 272
278__skip_enc: 273__skip_enc:
279 popq %rcx;
280 popq %rbx; 274 popq %rbx;
281 popq %rbp; 275 popq %rbp;
282 276
283 vmovdqa .Lbswap_mask, RKM; 277 vmovdqa .Lbswap_mask, RKM;
284 leaq 1*(2*4*4)(%r11), %rax;
285 278
286 testb %cl, %cl; 279 outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
287 jnz __enc_xor16; 280 outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
288 281 outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
289 outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); 282 outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
290 outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
291 leaq 2*(2*4*4)(%r11), %rax;
292 outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
293 leaq 3*(2*4*4)(%r11), %rax;
294 outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
295
296 ret;
297
298__enc_xor16:
299 outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
300 outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
301 leaq 2*(2*4*4)(%r11), %rax;
302 outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
303 leaq 3*(2*4*4)(%r11), %rax;
304 outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
305 283
306 ret; 284 ret;
307 285
308.align 16 286.align 16
309.global cast5_dec_blk_16way 287.type __cast5_dec_blk16,@function;
310.type cast5_dec_blk_16way,@function;
311 288
312cast5_dec_blk_16way: 289__cast5_dec_blk16:
313 /* input: 290 /* input:
314 * %rdi: ctx, CTX 291 * %rdi: ctx, CTX
315 * %rsi: dst 292 * RL1: encrypted blocks 1 and 2
316 * %rdx: src 293 * RR1: encrypted blocks 3 and 4
294 * RL2: encrypted blocks 5 and 6
295 * RR2: encrypted blocks 7 and 8
296 * RL3: encrypted blocks 9 and 10
297 * RR3: encrypted blocks 11 and 12
298 * RL4: encrypted blocks 13 and 14
299 * RR4: encrypted blocks 15 and 16
300 * output:
301 * RL1: decrypted blocks 1 and 2
302 * RR1: decrypted blocks 3 and 4
303 * RL2: decrypted blocks 5 and 6
304 * RR2: decrypted blocks 7 and 8
305 * RL3: decrypted blocks 9 and 10
306 * RR3: decrypted blocks 11 and 12
307 * RL4: decrypted blocks 13 and 14
308 * RR4: decrypted blocks 15 and 16
317 */ 309 */
318 310
319 pushq %rbp; 311 pushq %rbp;
@@ -324,15 +316,10 @@ cast5_dec_blk_16way:
324 vmovd .L32_mask, R32; 316 vmovd .L32_mask, R32;
325 dec_preload_rkr(); 317 dec_preload_rkr();
326 318
327 leaq 1*(2*4*4)(%rdx), %rax; 319 inpack_blocks(RL1, RR1, RTMP, RX, RKM);
328 inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); 320 inpack_blocks(RL2, RR2, RTMP, RX, RKM);
329 inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); 321 inpack_blocks(RL3, RR3, RTMP, RX, RKM);
330 leaq 2*(2*4*4)(%rdx), %rax; 322 inpack_blocks(RL4, RR4, RTMP, RX, RKM);
331 inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
332 leaq 3*(2*4*4)(%rdx), %rax;
333 inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
334
335 movq %rsi, %r11;
336 323
337 movzbl rr(CTX), %eax; 324 movzbl rr(CTX), %eax;
338 testl %eax, %eax; 325 testl %eax, %eax;
@@ -361,16 +348,211 @@ __dec_tail:
361 popq %rbx; 348 popq %rbx;
362 popq %rbp; 349 popq %rbp;
363 350
364 leaq 1*(2*4*4)(%r11), %rax; 351 outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
365 outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); 352 outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
366 outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); 353 outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
367 leaq 2*(2*4*4)(%r11), %rax; 354 outunpack_blocks(RR4, RL4, RTMP, RX, RKM);
368 outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
369 leaq 3*(2*4*4)(%r11), %rax;
370 outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
371 355
372 ret; 356 ret;
373 357
374__skip_dec: 358__skip_dec:
375 vpsrldq $4, RKR, RKR; 359 vpsrldq $4, RKR, RKR;
376 jmp __dec_tail; 360 jmp __dec_tail;
361
362.align 16
363.global cast5_ecb_enc_16way
364.type cast5_ecb_enc_16way,@function;
365
366cast5_ecb_enc_16way:
367 /* input:
368 * %rdi: ctx, CTX
369 * %rsi: dst
370 * %rdx: src
371 */
372
373 movq %rsi, %r11;
374
375 vmovdqu (0*4*4)(%rdx), RL1;
376 vmovdqu (1*4*4)(%rdx), RR1;
377 vmovdqu (2*4*4)(%rdx), RL2;
378 vmovdqu (3*4*4)(%rdx), RR2;
379 vmovdqu (4*4*4)(%rdx), RL3;
380 vmovdqu (5*4*4)(%rdx), RR3;
381 vmovdqu (6*4*4)(%rdx), RL4;
382 vmovdqu (7*4*4)(%rdx), RR4;
383
384 call __cast5_enc_blk16;
385
386 vmovdqu RR1, (0*4*4)(%r11);
387 vmovdqu RL1, (1*4*4)(%r11);
388 vmovdqu RR2, (2*4*4)(%r11);
389 vmovdqu RL2, (3*4*4)(%r11);
390 vmovdqu RR3, (4*4*4)(%r11);
391 vmovdqu RL3, (5*4*4)(%r11);
392 vmovdqu RR4, (6*4*4)(%r11);
393 vmovdqu RL4, (7*4*4)(%r11);
394
395 ret;
396
397.align 16
398.global cast5_ecb_dec_16way
399.type cast5_ecb_dec_16way,@function;
400
401cast5_ecb_dec_16way:
402 /* input:
403 * %rdi: ctx, CTX
404 * %rsi: dst
405 * %rdx: src
406 */
407
408 movq %rsi, %r11;
409
410 vmovdqu (0*4*4)(%rdx), RL1;
411 vmovdqu (1*4*4)(%rdx), RR1;
412 vmovdqu (2*4*4)(%rdx), RL2;
413 vmovdqu (3*4*4)(%rdx), RR2;
414 vmovdqu (4*4*4)(%rdx), RL3;
415 vmovdqu (5*4*4)(%rdx), RR3;
416 vmovdqu (6*4*4)(%rdx), RL4;
417 vmovdqu (7*4*4)(%rdx), RR4;
418
419 call __cast5_dec_blk16;
420
421 vmovdqu RR1, (0*4*4)(%r11);
422 vmovdqu RL1, (1*4*4)(%r11);
423 vmovdqu RR2, (2*4*4)(%r11);
424 vmovdqu RL2, (3*4*4)(%r11);
425 vmovdqu RR3, (4*4*4)(%r11);
426 vmovdqu RL3, (5*4*4)(%r11);
427 vmovdqu RR4, (6*4*4)(%r11);
428 vmovdqu RL4, (7*4*4)(%r11);
429
430 ret;
431
432.align 16
433.global cast5_cbc_dec_16way
434.type cast5_cbc_dec_16way,@function;
435
436cast5_cbc_dec_16way:
437 /* input:
438 * %rdi: ctx, CTX
439 * %rsi: dst
440 * %rdx: src
441 */
442
443 pushq %r12;
444
445 movq %rsi, %r11;
446 movq %rdx, %r12;
447
448 vmovdqu (0*16)(%rdx), RL1;
449 vmovdqu (1*16)(%rdx), RR1;
450 vmovdqu (2*16)(%rdx), RL2;
451 vmovdqu (3*16)(%rdx), RR2;
452 vmovdqu (4*16)(%rdx), RL3;
453 vmovdqu (5*16)(%rdx), RR3;
454 vmovdqu (6*16)(%rdx), RL4;
455 vmovdqu (7*16)(%rdx), RR4;
456
457 call __cast5_dec_blk16;
458
459 /* xor with src */
460 vmovq (%r12), RX;
461 vpshufd $0x4f, RX, RX;
462 vpxor RX, RR1, RR1;
463 vpxor 0*16+8(%r12), RL1, RL1;
464 vpxor 1*16+8(%r12), RR2, RR2;
465 vpxor 2*16+8(%r12), RL2, RL2;
466 vpxor 3*16+8(%r12), RR3, RR3;
467 vpxor 4*16+8(%r12), RL3, RL3;
468 vpxor 5*16+8(%r12), RR4, RR4;
469 vpxor 6*16+8(%r12), RL4, RL4;
470
471 vmovdqu RR1, (0*16)(%r11);
472 vmovdqu RL1, (1*16)(%r11);
473 vmovdqu RR2, (2*16)(%r11);
474 vmovdqu RL2, (3*16)(%r11);
475 vmovdqu RR3, (4*16)(%r11);
476 vmovdqu RL3, (5*16)(%r11);
477 vmovdqu RR4, (6*16)(%r11);
478 vmovdqu RL4, (7*16)(%r11);
479
480 popq %r12;
481
482 ret;
483
484.align 16
485.global cast5_ctr_16way
486.type cast5_ctr_16way,@function;
487
488cast5_ctr_16way:
489 /* input:
490 * %rdi: ctx, CTX
491 * %rsi: dst
492 * %rdx: src
493 * %rcx: iv (big endian, 64bit)
494 */
495
496 pushq %r12;
497
498 movq %rsi, %r11;
499 movq %rdx, %r12;
500
501 vpcmpeqd RTMP, RTMP, RTMP;
502 vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */
503
504 vpcmpeqd RKR, RKR, RKR;
505 vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
506 vmovdqa .Lbswap_iv_mask, R1ST;
507 vmovdqa .Lbswap128_mask, RKM;
508
509 /* load IV and byteswap */
510 vmovq (%rcx), RX;
511 vpshufb R1ST, RX, RX;
512
513 /* construct IVs */
514 vpsubq RTMP, RX, RX; /* le: IV1, IV0 */
515 vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
516 vpsubq RKR, RX, RX;
517 vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
518 vpsubq RKR, RX, RX;
519 vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
520 vpsubq RKR, RX, RX;
521 vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
522 vpsubq RKR, RX, RX;
523 vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
524 vpsubq RKR, RX, RX;
525 vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
526 vpsubq RKR, RX, RX;
527 vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
528 vpsubq RKR, RX, RX;
529 vpshufb RKM, RX, RR4; /* be: IV14, IV15 */
530
531 /* store last IV */
532 vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
533 vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
534 vmovq RX, (%rcx);
535
536 call __cast5_enc_blk16;
537
538 /* dst = src ^ iv */
539 vpxor (0*16)(%r12), RR1, RR1;
540 vpxor (1*16)(%r12), RL1, RL1;
541 vpxor (2*16)(%r12), RR2, RR2;
542 vpxor (3*16)(%r12), RL2, RL2;
543 vpxor (4*16)(%r12), RR3, RR3;
544 vpxor (5*16)(%r12), RL3, RL3;
545 vpxor (6*16)(%r12), RR4, RR4;
546 vpxor (7*16)(%r12), RL4, RL4;
547 vmovdqu RR1, (0*16)(%r11);
548 vmovdqu RL1, (1*16)(%r11);
549 vmovdqu RR2, (2*16)(%r11);
550 vmovdqu RL2, (3*16)(%r11);
551 vmovdqu RR3, (4*16)(%r11);
552 vmovdqu RL3, (5*16)(%r11);
553 vmovdqu RR4, (6*16)(%r11);
554 vmovdqu RL4, (7*16)(%r11);
555
556 popq %r12;
557
558 ret;
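(cast5_ctr_16way above expands the 64-bit big-endian counter at %rcx into 16 consecutive counter blocks, encrypts them, xors the result into the source data, and writes the advanced counter back. A scalar C sketch of just the counter handling, with illustrative helper names rather than the kernel API:)

#include <stdint.h>

static uint64_t load_be64(const uint8_t *p)
{
	uint64_t v = 0;
	for (int i = 0; i < 8; i++)
		v = (v << 8) | p[i];
	return v;
}

static void store_be64(uint8_t *p, uint64_t v)
{
	for (int i = 7; i >= 0; i--) {
		p[i] = (uint8_t)v;
		v >>= 8;
	}
}

/* Scalar model of the counter expansion done by cast5_ctr_16way: emit
 * 16 consecutive 64-bit big-endian counter blocks and store the
 * incremented counter back through the IV pointer. */
static void cast5_ctr_blocks_sketch(uint8_t blocks[16][8], uint8_t iv[8])
{
	uint64_t ctr = load_be64(iv);

	for (int i = 0; i < 16; i++)
		store_be64(blocks[i], ctr + i);

	store_be64(iv, ctr + 16);	/* the asm writes this back via %rcx */
}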
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e0ea14f9547f..c6631813dc11 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@
37 37
38#define CAST5_PARALLEL_BLOCKS 16 38#define CAST5_PARALLEL_BLOCKS 16
39 39
40asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, 40asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst,
41 const u8 *src, bool xor);
42asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst,
43 const u8 *src); 41 const u8 *src);
44 42asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst,
45static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst, 43 const u8 *src);
46 const u8 *src) 44asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst,
47{ 45 const u8 *src);
48 __cast5_enc_blk_16way(ctx, dst, src, false); 46asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
49} 47 __be64 *iv);
50
51static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst,
52 const u8 *src)
53{
54 __cast5_enc_blk_16way(ctx, dst, src, true);
55}
56
57static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst,
58 const u8 *src)
59{
60 cast5_dec_blk_16way(ctx, dst, src);
61}
62
63 48
64static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) 49static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes)
65{ 50{
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
79 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); 64 struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
80 const unsigned int bsize = CAST5_BLOCK_SIZE; 65 const unsigned int bsize = CAST5_BLOCK_SIZE;
81 unsigned int nbytes; 66 unsigned int nbytes;
67 void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src);
82 int err; 68 int err;
83 69
70 fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way;
71
84 err = blkcipher_walk_virt(desc, walk); 72 err = blkcipher_walk_virt(desc, walk);
85 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; 73 desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
86 74
@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
93 /* Process multi-block batch */ 81 /* Process multi-block batch */
94 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { 82 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
95 do { 83 do {
96 if (enc) 84 fn(ctx, wdst, wsrc);
97 cast5_enc_blk_xway(ctx, wdst, wsrc);
98 else
99 cast5_dec_blk_xway(ctx, wdst, wsrc);
100 85
101 wsrc += bsize * CAST5_PARALLEL_BLOCKS; 86 wsrc += bsize * CAST5_PARALLEL_BLOCKS;
102 wdst += bsize * CAST5_PARALLEL_BLOCKS; 87 wdst += bsize * CAST5_PARALLEL_BLOCKS;
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk,
107 goto done; 92 goto done;
108 } 93 }
109 94
95 fn = (enc) ? __cast5_encrypt : __cast5_decrypt;
96
110 /* Handle leftovers */ 97 /* Handle leftovers */
111 do { 98 do {
112 if (enc) 99 fn(ctx, wdst, wsrc);
113 __cast5_encrypt(ctx, wdst, wsrc);
114 else
115 __cast5_decrypt(ctx, wdst, wsrc);
116 100
117 wsrc += bsize; 101 wsrc += bsize;
118 wdst += bsize; 102 wdst += bsize;
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
194 unsigned int nbytes = walk->nbytes; 178 unsigned int nbytes = walk->nbytes;
195 u64 *src = (u64 *)walk->src.virt.addr; 179 u64 *src = (u64 *)walk->src.virt.addr;
196 u64 *dst = (u64 *)walk->dst.virt.addr; 180 u64 *dst = (u64 *)walk->dst.virt.addr;
197 u64 ivs[CAST5_PARALLEL_BLOCKS - 1];
198 u64 last_iv; 181 u64 last_iv;
199 int i;
200 182
201 /* Start of the last block. */ 183 /* Start of the last block. */
202 src += nbytes / bsize - 1; 184 src += nbytes / bsize - 1;
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc,
211 src -= CAST5_PARALLEL_BLOCKS - 1; 193 src -= CAST5_PARALLEL_BLOCKS - 1;
212 dst -= CAST5_PARALLEL_BLOCKS - 1; 194 dst -= CAST5_PARALLEL_BLOCKS - 1;
213 195
214 for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) 196 cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src);
215 ivs[i] = src[i];
216
217 cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
218
219 for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++)
220 *(dst + (i + 1)) ^= *(ivs + i);
221 197
222 nbytes -= bsize; 198 nbytes -= bsize;
223 if (nbytes < bsize) 199 if (nbytes < bsize)
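(The hunk above drops the temporary ivs[] buffer because cast5_cbc_dec_16way now performs the chained xor with the previous ciphertext block itself. CBC decryption parallelizes at all because each plaintext block depends only on two ciphertext blocks. A minimal standalone sketch of that structure, assuming non-overlapping dst/src and a generic 8-byte block decrypt callback instead of the kernel API:)

#include <stdint.h>
#include <stddef.h>

/* Generic single-block (8-byte) decrypt callback standing in for cast5;
 * an assumption for illustration only. */
typedef void (*blk_dec_fn)(void *ctx, uint8_t *dst, const uint8_t *src);

/* CBC decryption of n blocks: every D(C[i]) is independent, so the block
 * decrypts can run in parallel (16 at a time in the AVX code); only the
 * final xor needs the previous ciphertext block, or the IV for block 0.
 * Assumes dst and src do not overlap. */
static void cbc_decrypt_sketch(void *ctx, blk_dec_fn dec, uint8_t *dst,
			       const uint8_t *src, size_t n,
			       const uint8_t iv[8])
{
	for (size_t i = 0; i < n; i++)		/* parallelizable part */
		dec(ctx, dst + 8 * i, src + 8 * i);

	for (size_t i = n; i-- > 1;)		/* chained xor part */
		for (int j = 0; j < 8; j++)
			dst[8 * i + j] ^= src[8 * (i - 1) + j];

	for (int j = 0; j < 8; j++)
		dst[j] ^= iv[j];
}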
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
298 unsigned int nbytes = walk->nbytes; 274 unsigned int nbytes = walk->nbytes;
299 u64 *src = (u64 *)walk->src.virt.addr; 275 u64 *src = (u64 *)walk->src.virt.addr;
300 u64 *dst = (u64 *)walk->dst.virt.addr; 276 u64 *dst = (u64 *)walk->dst.virt.addr;
301 u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv);
302 __be64 ctrblocks[CAST5_PARALLEL_BLOCKS];
303 int i;
304 277
305 /* Process multi-block batch */ 278 /* Process multi-block batch */
306 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { 279 if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) {
307 do { 280 do {
308 /* create ctrblks for parallel encrypt */ 281 cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src,
309 for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) { 282 (__be64 *)walk->iv);
310 if (dst != src)
311 dst[i] = src[i];
312
313 ctrblocks[i] = cpu_to_be64(ctrblk++);
314 }
315
316 cast5_enc_blk_xway_xor(ctx, (u8 *)dst,
317 (u8 *)ctrblocks);
318 283
319 src += CAST5_PARALLEL_BLOCKS; 284 src += CAST5_PARALLEL_BLOCKS;
320 dst += CAST5_PARALLEL_BLOCKS; 285 dst += CAST5_PARALLEL_BLOCKS;
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
327 292
328 /* Handle leftovers */ 293 /* Handle leftovers */
329 do { 294 do {
295 u64 ctrblk;
296
330 if (dst != src) 297 if (dst != src)
331 *dst = *src; 298 *dst = *src;
332 299
333 ctrblocks[0] = cpu_to_be64(ctrblk++); 300 ctrblk = *(u64 *)walk->iv;
301 be64_add_cpu((__be64 *)walk->iv, 1);
334 302
335 __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); 303 __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
336 *dst ^= ctrblocks[0]; 304 *dst ^= ctrblk;
337 305
338 src += 1; 306 src += 1;
339 dst += 1; 307 dst += 1;
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc,
341 } while (nbytes >= bsize); 309 } while (nbytes >= bsize);
342 310
343done: 311done:
344 *(__be64 *)walk->iv = cpu_to_be64(ctrblk);
345 return nbytes; 312 return nbytes;
346} 313}
347 314
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283772f4..2569d0da841f 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,22 +23,24 @@
23 * 23 *
24 */ 24 */
25 25
26#include "glue_helper-asm-avx.S"
27
26.file "cast6-avx-x86_64-asm_64.S" 28.file "cast6-avx-x86_64-asm_64.S"
27 29
28.extern cast6_s1 30.extern cast_s1
29.extern cast6_s2 31.extern cast_s2
30.extern cast6_s3 32.extern cast_s3
31.extern cast6_s4 33.extern cast_s4
32 34
33/* structure of crypto context */ 35/* structure of crypto context */
34#define km 0 36#define km 0
35#define kr (12*4*4) 37#define kr (12*4*4)
36 38
37/* s-boxes */ 39/* s-boxes */
38#define s1 cast6_s1 40#define s1 cast_s1
39#define s2 cast6_s2 41#define s2 cast_s2
40#define s3 cast6_s3 42#define s3 cast_s3
41#define s4 cast6_s4 43#define s4 cast_s4
42 44
43/********************************************************************** 45/**********************************************************************
44 8-way AVX cast6 46 8-way AVX cast6
@@ -205,11 +207,7 @@
205 vpunpcklqdq x3, t2, x2; \ 207 vpunpcklqdq x3, t2, x2; \
206 vpunpckhqdq x3, t2, x3; 208 vpunpckhqdq x3, t2, x3;
207 209
208#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \ 210#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
209 vmovdqu (0*4*4)(in), x0; \
210 vmovdqu (1*4*4)(in), x1; \
211 vmovdqu (2*4*4)(in), x2; \
212 vmovdqu (3*4*4)(in), x3; \
213 vpshufb rmask, x0, x0; \ 211 vpshufb rmask, x0, x0; \
214 vpshufb rmask, x1, x1; \ 212 vpshufb rmask, x1, x1; \
215 vpshufb rmask, x2, x2; \ 213 vpshufb rmask, x2, x2; \
@@ -217,39 +215,21 @@
217 \ 215 \
218 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 216 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
219 217
220#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ 218#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
221 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 219 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
222 \ 220 \
223 vpshufb rmask, x0, x0; \ 221 vpshufb rmask, x0, x0; \
224 vpshufb rmask, x1, x1; \ 222 vpshufb rmask, x1, x1; \
225 vpshufb rmask, x2, x2; \ 223 vpshufb rmask, x2, x2; \
226 vpshufb rmask, x3, x3; \ 224 vpshufb rmask, x3, x3;
227 vmovdqu x0, (0*4*4)(out); \
228 vmovdqu x1, (1*4*4)(out); \
229 vmovdqu x2, (2*4*4)(out); \
230 vmovdqu x3, (3*4*4)(out);
231
232#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
233 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
234 \
235 vpshufb rmask, x0, x0; \
236 vpshufb rmask, x1, x1; \
237 vpshufb rmask, x2, x2; \
238 vpshufb rmask, x3, x3; \
239 vpxor (0*4*4)(out), x0, x0; \
240 vmovdqu x0, (0*4*4)(out); \
241 vpxor (1*4*4)(out), x1, x1; \
242 vmovdqu x1, (1*4*4)(out); \
243 vpxor (2*4*4)(out), x2, x2; \
244 vmovdqu x2, (2*4*4)(out); \
245 vpxor (3*4*4)(out), x3, x3; \
246 vmovdqu x3, (3*4*4)(out);
247 225
248.data 226.data
249 227
250.align 16 228.align 16
251.Lbswap_mask: 229.Lbswap_mask:
252 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 230 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
231.Lbswap128_mask:
232 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
253.Lrkr_enc_Q_Q_QBAR_QBAR: 233.Lrkr_enc_Q_Q_QBAR_QBAR:
254 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12 234 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
255.Lrkr_enc_QBAR_QBAR_QBAR_QBAR: 235.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
@@ -269,31 +249,26 @@
269 249
270.text 250.text
271 251
272.align 16 252.align 8
273.global __cast6_enc_blk_8way 253.type __cast6_enc_blk8,@function;
274.type __cast6_enc_blk_8way,@function;
275 254
276__cast6_enc_blk_8way: 255__cast6_enc_blk8:
277 /* input: 256 /* input:
278 * %rdi: ctx, CTX 257 * %rdi: ctx, CTX
279 * %rsi: dst 258 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
280 * %rdx: src 259 * output:
281 * %rcx: bool, if true: xor output 260 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
282 */ 261 */
283 262
284 pushq %rbp; 263 pushq %rbp;
285 pushq %rbx; 264 pushq %rbx;
286 pushq %rcx;
287 265
288 vmovdqa .Lbswap_mask, RKM; 266 vmovdqa .Lbswap_mask, RKM;
289 vmovd .Lfirst_mask, R1ST; 267 vmovd .Lfirst_mask, R1ST;
290 vmovd .L32_mask, R32; 268 vmovd .L32_mask, R32;
291 269
292 leaq (4*4*4)(%rdx), %rax; 270 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
293 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 271 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
294 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
295
296 movq %rsi, %r11;
297 272
298 preload_rkr(0, dummy, none); 273 preload_rkr(0, dummy, none);
299 Q(0); 274 Q(0);
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way:
311 QBAR(10); 286 QBAR(10);
312 QBAR(11); 287 QBAR(11);
313 288
314 popq %rcx;
315 popq %rbx; 289 popq %rbx;
316 popq %rbp; 290 popq %rbp;
317 291
318 vmovdqa .Lbswap_mask, RKM; 292 vmovdqa .Lbswap_mask, RKM;
319 leaq (4*4*4)(%r11), %rax;
320
321 testb %cl, %cl;
322 jnz __enc_xor8;
323
324 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
325 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
326
327 ret;
328 293
329__enc_xor8: 294 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
330 outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 295 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
331 outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
332 296
333 ret; 297 ret;
334 298
335.align 16 299.align 8
336.global cast6_dec_blk_8way 300.type __cast6_dec_blk8,@function;
337.type cast6_dec_blk_8way,@function;
338 301
339cast6_dec_blk_8way: 302__cast6_dec_blk8:
340 /* input: 303 /* input:
341 * %rdi: ctx, CTX 304 * %rdi: ctx, CTX
342 * %rsi: dst 305 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
343 * %rdx: src 306 * output:
307 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
344 */ 308 */
345 309
346 pushq %rbp; 310 pushq %rbp;
@@ -350,11 +314,8 @@ cast6_dec_blk_8way:
350 vmovd .Lfirst_mask, R1ST; 314 vmovd .Lfirst_mask, R1ST;
351 vmovd .L32_mask, R32; 315 vmovd .L32_mask, R32;
352 316
353 leaq (4*4*4)(%rdx), %rax; 317 inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
354 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 318 inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
355 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
356
357 movq %rsi, %r11;
358 319
359 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q); 320 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
360 Q(11); 321 Q(11);
@@ -376,8 +337,103 @@ cast6_dec_blk_8way:
376 popq %rbp; 337 popq %rbp;
377 338
378 vmovdqa .Lbswap_mask, RKM; 339 vmovdqa .Lbswap_mask, RKM;
379 leaq (4*4*4)(%r11), %rax; 340 outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
380 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); 341 outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
381 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); 342
343 ret;
344
345.align 8
346.global cast6_ecb_enc_8way
347.type cast6_ecb_enc_8way,@function;
348
349cast6_ecb_enc_8way:
350 /* input:
351 * %rdi: ctx, CTX
352 * %rsi: dst
353 * %rdx: src
354 */
355
356 movq %rsi, %r11;
357
358 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
359
360 call __cast6_enc_blk8;
361
362 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
363
364 ret;
365
366.align 8
367.global cast6_ecb_dec_8way
368.type cast6_ecb_dec_8way,@function;
369
370cast6_ecb_dec_8way:
371 /* input:
372 * %rdi: ctx, CTX
373 * %rsi: dst
374 * %rdx: src
375 */
376
377 movq %rsi, %r11;
378
379 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
380
381 call __cast6_dec_blk8;
382
383 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
384
385 ret;
386
387.align 8
388.global cast6_cbc_dec_8way
389.type cast6_cbc_dec_8way,@function;
390
391cast6_cbc_dec_8way:
392 /* input:
393 * %rdi: ctx, CTX
394 * %rsi: dst
395 * %rdx: src
396 */
397
398 pushq %r12;
399
400 movq %rsi, %r11;
401 movq %rdx, %r12;
402
403 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
404
405 call __cast6_dec_blk8;
406
407 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
408
409 popq %r12;
410
411 ret;
412
413.align 8
414.global cast6_ctr_8way
415.type cast6_ctr_8way,@function;
416
417cast6_ctr_8way:
418 /* input:
419 * %rdi: ctx, CTX
420 * %rsi: dst
421 * %rdx: src
422 * %rcx: iv (little endian, 128bit)
423 */
424
425 pushq %r12;
426
427 movq %rsi, %r11;
428 movq %rdx, %r12;
429
430 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
431 RD2, RX, RKR, RKM);
432
433 call __cast6_enc_blk8;
434
435 store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
436
437 popq %r12;
382 438
383 ret; 439 ret;
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 15e5f85a5011..92f7ca24790a 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -40,79 +40,34 @@
40 40
41#define CAST6_PARALLEL_BLOCKS 8 41#define CAST6_PARALLEL_BLOCKS 8
42 42
43asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, 43asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst,
44 const u8 *src, bool xor); 44 const u8 *src);
45asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, 45asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst,
46 const u8 *src); 46 const u8 *src);
47 47
48static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, 48asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst,
49 const u8 *src) 49 const u8 *src);
50{ 50asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src,
51 __cast6_enc_blk_8way(ctx, dst, src, false); 51 le128 *iv);
52}
53
54static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst,
55 const u8 *src)
56{
57 __cast6_enc_blk_8way(ctx, dst, src, true);
58}
59
60static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst,
61 const u8 *src)
62{
63 cast6_dec_blk_8way(ctx, dst, src);
64}
65
66
67static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
68{
69 u128 ivs[CAST6_PARALLEL_BLOCKS - 1];
70 unsigned int j;
71
72 for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
73 ivs[j] = src[j];
74
75 cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
76
77 for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++)
78 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
79}
80 52
81static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 53static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
82{ 54{
83 be128 ctrblk; 55 be128 ctrblk;
84 56
85 u128_to_be128(&ctrblk, iv); 57 le128_to_be128(&ctrblk, iv);
86 u128_inc(iv); 58 le128_inc(iv);
87 59
88 __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 60 __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
89 u128_xor(dst, src, (u128 *)&ctrblk); 61 u128_xor(dst, src, (u128 *)&ctrblk);
90} 62}
91 63
92static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 u128 *iv)
94{
95 be128 ctrblks[CAST6_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 u128_to_be128(&ctrblks[i], iv);
103 u128_inc(iv);
104 }
105
106 cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
108
109static const struct common_glue_ctx cast6_enc = { 64static const struct common_glue_ctx cast6_enc = {
110 .num_funcs = 2, 65 .num_funcs = 2,
111 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, 66 .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS,
112 67
113 .funcs = { { 68 .funcs = { {
114 .num_blocks = CAST6_PARALLEL_BLOCKS, 69 .num_blocks = CAST6_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } 70 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) }
116 }, { 71 }, {
117 .num_blocks = 1, 72 .num_blocks = 1,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } 73 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) }
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = {
125 80
126 .funcs = { { 81 .funcs = { {
127 .num_blocks = CAST6_PARALLEL_BLOCKS, 82 .num_blocks = CAST6_PARALLEL_BLOCKS,
128 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } 83 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) }
129 }, { 84 }, {
130 .num_blocks = 1, 85 .num_blocks = 1,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } 86 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) }
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = {
138 93
139 .funcs = { { 94 .funcs = { {
140 .num_blocks = CAST6_PARALLEL_BLOCKS, 95 .num_blocks = CAST6_PARALLEL_BLOCKS,
141 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } 96 .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) }
142 }, { 97 }, {
143 .num_blocks = 1, 98 .num_blocks = 1,
144 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } 99 .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) }
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = {
151 106
152 .funcs = { { 107 .funcs = { {
153 .num_blocks = CAST6_PARALLEL_BLOCKS, 108 .num_blocks = CAST6_PARALLEL_BLOCKS,
154 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } 109 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) }
155 }, { 110 }, {
156 .num_blocks = 1, 111 .num_blocks = 1,
157 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } 112 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) }
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
215 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); 170 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
216 171
217 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { 172 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
218 cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); 173 cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
219 return; 174 return;
220 } 175 }
221 176
@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
232 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); 187 ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes);
233 188
234 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { 189 if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) {
235 cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); 190 cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
236 return; 191 return;
237 } 192 }
238 193
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel_glue.c
index 493f959261f7..6812ad98355c 100644
--- a/arch/x86/crypto/crc32c-intel.c
+++ b/arch/x86/crypto/crc32c-intel_glue.c
@@ -32,6 +32,8 @@
32 32
33#include <asm/cpufeature.h> 33#include <asm/cpufeature.h>
34#include <asm/cpu_device_id.h> 34#include <asm/cpu_device_id.h>
35#include <asm/i387.h>
36#include <asm/fpu-internal.h>
35 37
36#define CHKSUM_BLOCK_SIZE 1 38#define CHKSUM_BLOCK_SIZE 1
37#define CHKSUM_DIGEST_SIZE 4 39#define CHKSUM_DIGEST_SIZE 4
@@ -44,6 +46,31 @@
44#define REX_PRE 46#define REX_PRE
45#endif 47#endif
46 48
49#ifdef CONFIG_X86_64
50/*
51 * use carryless multiply version of crc32c when buffer
52 * size is >= 512 (when eager fpu is enabled) or
53 * >= 1024 (when eager fpu is disabled) to account
54 * for fpu state save/restore overhead.
55 */
56#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
57#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024
58
59asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
60 unsigned int crc_init);
61static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
62#if defined(X86_FEATURE_EAGER_FPU)
63#define set_pcl_breakeven_point() \
64do { \
65 if (!use_eager_fpu()) \
66 crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
67} while (0)
68#else
69#define set_pcl_breakeven_point() \
70 (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
71#endif
72#endif /* CONFIG_X86_64 */
73
47static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) 74static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
48{ 75{
49 while (length--) { 76 while (length--) {
@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
154 return 0; 181 return 0;
155} 182}
156 183
184#ifdef CONFIG_X86_64
185static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
186 unsigned int len)
187{
188 u32 *crcp = shash_desc_ctx(desc);
189
190 /*
191 * use faster PCL version if datasize is large enough to
192 * overcome kernel fpu state save/restore overhead
193 */
194 if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
195 kernel_fpu_begin();
196 *crcp = crc_pcl(data, len, *crcp);
197 kernel_fpu_end();
198 } else
199 *crcp = crc32c_intel_le_hw(*crcp, data, len);
200 return 0;
201}
202
203static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
204 u8 *out)
205{
206 if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
207 kernel_fpu_begin();
208 *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
209 kernel_fpu_end();
210 } else
211 *(__le32 *)out =
212 ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
213 return 0;
214}
215
216static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
217 unsigned int len, u8 *out)
218{
219 return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
220}
221
222static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
223 unsigned int len, u8 *out)
224{
225 return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
226 out);
227}
228#endif /* CONFIG_X86_64 */
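(For comparison with the finup/digest convention above, where the running CRC conventionally starts at ~0 and the digest is its bitwise inverse stored little endian, here is a minimal bit-at-a-time software CRC32C; it is a reference sketch for checking results, not part of this patch or its fast path:)

#include <stdint.h>
#include <stddef.h>

/* Bit-at-a-time CRC32C (Castagnoli, reflected polynomial 0x82F63B78). */
static uint32_t crc32c_sw(uint32_t crc, const uint8_t *data, size_t len)
{
	while (len--) {
		crc ^= *data++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* Same convention as the finup helpers above: start from ~0, invert at
 * the end.  crc32c_sw(~0u, (const uint8_t *)"123456789", 9) ^ ~0u gives
 * 0xe3069283, the commonly cited CRC32C check value. */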
229
157static struct shash_alg alg = { 230static struct shash_alg alg = {
158 .setkey = crc32c_intel_setkey, 231 .setkey = crc32c_intel_setkey,
159 .init = crc32c_intel_init, 232 .init = crc32c_intel_init,
@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)
184{ 257{
185 if (!x86_match_cpu(crc32c_cpu_id)) 258 if (!x86_match_cpu(crc32c_cpu_id))
186 return -ENODEV; 259 return -ENODEV;
260#ifdef CONFIG_X86_64
261 if (cpu_has_pclmulqdq) {
262 alg.update = crc32c_pcl_intel_update;
263 alg.finup = crc32c_pcl_intel_finup;
264 alg.digest = crc32c_pcl_intel_digest;
265 set_pcl_breakeven_point();
266 }
267#endif
187 return crypto_register_shash(&alg); 268 return crypto_register_shash(&alg);
188} 269}
189 270
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
new file mode 100644
index 000000000000..93c6d39237ac
--- /dev/null
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -0,0 +1,460 @@
1/*
2 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
3 *
4 * The white paper on CRC32C calculations with PCLMULQDQ instruction can be
5 * downloaded from:
6 * http://download.intel.com/design/intarch/papers/323405.pdf
7 *
8 * Copyright (C) 2012 Intel Corporation.
9 *
10 * Authors:
11 * Wajdi Feghali <wajdi.k.feghali@intel.com>
12 * James Guilford <james.guilford@intel.com>
13 * David Cote <david.m.cote@intel.com>
14 * Tim Chen <tim.c.chen@linux.intel.com>
15 *
16 * This software is available to you under a choice of one of two
17 * licenses. You may choose to be licensed under the terms of the GNU
18 * General Public License (GPL) Version 2, available from the file
19 * COPYING in the main directory of this source tree, or the
20 * OpenIB.org BSD license below:
21 *
22 * Redistribution and use in source and binary forms, with or
23 * without modification, are permitted provided that the following
24 * conditions are met:
25 *
26 * - Redistributions of source code must retain the above
27 * copyright notice, this list of conditions and the following
28 * disclaimer.
29 *
30 * - Redistributions in binary form must reproduce the above
31 * copyright notice, this list of conditions and the following
32 * disclaimer in the documentation and/or other materials
33 * provided with the distribution.
34 *
35 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
36 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
37 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
38 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
39 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
40 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
41 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
42 * SOFTWARE.
43 */
44
45## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
46
47.macro LABEL prefix n
48\prefix\n\():
49.endm
50
51.macro JMPTBL_ENTRY i
52.word crc_\i - crc_array
53.endm
54
55.macro JNC_LESS_THAN j
56 jnc less_than_\j
57.endm
58
59# Define threshold where buffers are considered "small" and routed to more
60# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
61# SMALL_SIZE can be no larger than 255.
62
63#define SMALL_SIZE 200
64
65.if (SMALL_SIZE > 255)
66.error "SMALL_SIZE must be < 256"
67.endif
68
69# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
70
71.global crc_pcl
72crc_pcl:
73#define bufp %rdi
74#define bufp_dw %edi
75#define bufp_w %di
76#define bufp_b %dil
77#define bufptmp %rcx
78#define block_0 %rcx
79#define block_1 %rdx
80#define block_2 %r11
81#define len %rsi
82#define len_dw %esi
83#define len_w %si
84#define len_b %sil
85#define crc_init_arg %rdx
86#define tmp %rbx
87#define crc_init %r8
88#define crc_init_dw %r8d
89#define crc1 %r9
90#define crc2 %r10
91
92 pushq %rbx
93 pushq %rdi
94 pushq %rsi
95
 96	## Move crc_init for Linux to a different register
97 mov crc_init_arg, crc_init
98
99 ################################################################
100 ## 1) ALIGN:
101 ################################################################
102
103	mov	bufp, bufptmp		# bufptmp = buf
104 neg bufp
105 and $7, bufp # calculate the unalignment amount of
106 # the address
107 je proc_block # Skip if aligned
108
109 ## If len is less than 8 and we're unaligned, we need to jump
110 ## to special code to avoid reading beyond the end of the buffer
111 cmp $8, len
112 jae do_align
113 # less_than_8 expects length in upper 3 bits of len_dw
114 # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
115 shl $32-3+1, len_dw
116 jmp less_than_8_post_shl1
117
118do_align:
119 #### Calculate CRC of unaligned bytes of the buffer (if any)
120	movq	(bufptmp), tmp		# load a quadword from the buffer
121 add bufp, bufptmp # align buffer pointer for quadword
122 # processing
123 sub bufp, len # update buffer length
124align_loop:
125 crc32b %bl, crc_init_dw # compute crc32 of 1-byte
126 shr $8, tmp # get next byte
127 dec bufp
128 jne align_loop
129
130proc_block:
131
132 ################################################################
133 ## 2) PROCESS BLOCKS:
134 ################################################################
135
136 ## compute num of bytes to be processed
137 movq len, tmp # save num bytes in tmp
138
139 cmpq $128*24, len
140 jae full_block
141
142continue_block:
143 cmpq $SMALL_SIZE, len
144 jb small
145
146 ## len < 128*24
147 movq $2731, %rax # 2731 = ceil(2^16 / 24)
148 mul len_dw
149 shrq $16, %rax
150
151 ## eax contains floor(bytes / 24) = num 24-byte chunks to do
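	## (Why this is exact: 2731 * 24 = 65544 = 2^16 + 8, so
	##  len * 2731 / 2^16 = len/24 + len/196608.  Since len < 128*24 here,
	##  that extra term stays below 1/64 and can never carry the value past
	##  the next integer; the 32-bit product also fits in %eax, so the
	##  16-bit shift yields exactly floor(len / 24).)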
152
153 ## process rax 24-byte chunks (128 >= rax >= 0)
154
155 ## compute end address of each block
156 ## block 0 (base addr + RAX * 8)
157 ## block 1 (base addr + RAX * 16)
158 ## block 2 (base addr + RAX * 24)
159 lea (bufptmp, %rax, 8), block_0
160 lea (block_0, %rax, 8), block_1
161 lea (block_1, %rax, 8), block_2
162
163 xor crc1, crc1
164 xor crc2, crc2
165
166 ## branch into array
167 lea jump_table(%rip), bufp
168 movzxw (bufp, %rax, 2), len
169 offset=crc_array-jump_table
170 lea offset(bufp, len, 1), bufp
171 jmp *bufp
172
173 ################################################################
174 ## 2a) PROCESS FULL BLOCKS:
175 ################################################################
176full_block:
177 movq $128,%rax
178 lea 128*8*2(block_0), block_1
179 lea 128*8*3(block_0), block_2
180 add $128*8*1, block_0
181
182 xor crc1,crc1
183 xor crc2,crc2
184
185	# Fall through into top of crc array (crc_128)
186
187 ################################################################
188 ## 3) CRC Array:
189 ################################################################
190
191crc_array:
192 i=128
193.rept 128-1
194.altmacro
195LABEL crc_ %i
196.noaltmacro
197 crc32q -i*8(block_0), crc_init
198 crc32q -i*8(block_1), crc1
199 crc32q -i*8(block_2), crc2
200 i=(i-1)
201.endr
202
203.altmacro
204LABEL crc_ %i
205.noaltmacro
206 crc32q -i*8(block_0), crc_init
207 crc32q -i*8(block_1), crc1
208# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet
209
210 mov block_2, block_0
211
212 ################################################################
213 ## 4) Combine three results:
214 ################################################################
215
216 lea (K_table-16)(%rip), bufp # first entry is for idx 1
217 shlq $3, %rax # rax *= 8
218 subq %rax, tmp # tmp -= rax*8
219 shlq $1, %rax
220 subq %rax, tmp # tmp -= rax*16
221 # (total tmp -= rax*24)
222 addq %rax, bufp
223
224 movdqa (bufp), %xmm0 # 2 consts: K1:K2
225
226 movq crc_init, %xmm1 # CRC for block 1
227 pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2
228
229 movq crc1, %xmm2 # CRC for block 2
230 pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1
231
232 pxor %xmm2,%xmm1
233 movq %xmm1, %rax
234 xor -i*8(block_2), %rax
235 mov crc2, crc_init
236 crc32 %rax, crc_init
237
238################################################################
239## 5) Check for end:
240################################################################
241
242LABEL crc_ 0
243 mov tmp, len
244 cmp $128*24, tmp
245 jae full_block
246 cmp $24, tmp
247 jae continue_block
248
249less_than_24:
250 shl $32-4, len_dw # less_than_16 expects length
251 # in upper 4 bits of len_dw
252 jnc less_than_16
253 crc32q (bufptmp), crc_init
254 crc32q 8(bufptmp), crc_init
255 jz do_return
256 add $16, bufptmp
257 # len is less than 8 if we got here
258 # less_than_8 expects length in upper 3 bits of len_dw
259 # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
260 shl $2, len_dw
261 jmp less_than_8_post_shl1
262
263 #######################################################################
264 ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
265 #######################################################################
266small:
267 shl $32-8, len_dw # Prepare len_dw for less_than_256
268 j=256
269.rept 5 # j = {256, 128, 64, 32, 16}
270.altmacro
271LABEL less_than_ %j # less_than_j: Length should be in
272 # upper lg(j) bits of len_dw
273 j=(j/2)
274 shl $1, len_dw # Get next MSB
275 JNC_LESS_THAN %j
276.noaltmacro
277 i=0
278.rept (j/8)
279 crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data
280 i=i+8
281.endr
282 jz do_return # Return if remaining length is zero
283 add $j, bufptmp # Advance buf
284.endr
285
286less_than_8: # Length should be stored in
287 # upper 3 bits of len_dw
288 shl $1, len_dw
289less_than_8_post_shl1:
290 jnc less_than_4
291 crc32l (bufptmp), crc_init_dw # CRC of 4 bytes
292 jz do_return # return if remaining data is zero
293 add $4, bufptmp
294less_than_4: # Length should be stored in
295 # upper 2 bits of len_dw
296 shl $1, len_dw
297 jnc less_than_2
298 crc32w (bufptmp), crc_init_dw # CRC of 2 bytes
299 jz do_return # return if remaining data is zero
300 add $2, bufptmp
301less_than_2: # Length should be stored in the MSB
302 # of len_dw
303 shl $1, len_dw
304 jnc less_than_1
305 crc32b (bufptmp), crc_init_dw # CRC of 1 byte
306less_than_1: # Length should be zero
307do_return:
308 movq crc_init, %rax
309 popq %rsi
310 popq %rdi
311 popq %rbx
312 ret
313
314 ################################################################
315	## jump table: 129 entries x 2 bytes each
316 ################################################################
317.align 4
318jump_table:
319 i=0
320.rept 129
321.altmacro
322JMPTBL_ENTRY %i
323.noaltmacro
324 i=i+1
325.endr
326 ################################################################
327 ## PCLMULQDQ tables
328 ## Table is 128 entries x 2 quad words each
329 ################################################################
330.data
331.align 64
332K_table:
333 .quad 0x14cd00bd6,0x105ec76f0
334 .quad 0x0ba4fc28e,0x14cd00bd6
335 .quad 0x1d82c63da,0x0f20c0dfe
336 .quad 0x09e4addf8,0x0ba4fc28e
337 .quad 0x039d3b296,0x1384aa63a
338 .quad 0x102f9b8a2,0x1d82c63da
339 .quad 0x14237f5e6,0x01c291d04
340 .quad 0x00d3b6092,0x09e4addf8
341 .quad 0x0c96cfdc0,0x0740eef02
342 .quad 0x18266e456,0x039d3b296
343 .quad 0x0daece73e,0x0083a6eec
344 .quad 0x0ab7aff2a,0x102f9b8a2
345 .quad 0x1248ea574,0x1c1733996
346 .quad 0x083348832,0x14237f5e6
347 .quad 0x12c743124,0x02ad91c30
348 .quad 0x0b9e02b86,0x00d3b6092
349 .quad 0x018b33a4e,0x06992cea2
350 .quad 0x1b331e26a,0x0c96cfdc0
351 .quad 0x17d35ba46,0x07e908048
352 .quad 0x1bf2e8b8a,0x18266e456
353 .quad 0x1a3e0968a,0x11ed1f9d8
354 .quad 0x0ce7f39f4,0x0daece73e
355 .quad 0x061d82e56,0x0f1d0f55e
356 .quad 0x0d270f1a2,0x0ab7aff2a
357 .quad 0x1c3f5f66c,0x0a87ab8a8
358 .quad 0x12ed0daac,0x1248ea574
359 .quad 0x065863b64,0x08462d800
360 .quad 0x11eef4f8e,0x083348832
361 .quad 0x1ee54f54c,0x071d111a8
362 .quad 0x0b3e32c28,0x12c743124
363 .quad 0x0064f7f26,0x0ffd852c6
364 .quad 0x0dd7e3b0c,0x0b9e02b86
365 .quad 0x0f285651c,0x0dcb17aa4
366 .quad 0x010746f3c,0x018b33a4e
367 .quad 0x1c24afea4,0x0f37c5aee
368 .quad 0x0271d9844,0x1b331e26a
369 .quad 0x08e766a0c,0x06051d5a2
370 .quad 0x093a5f730,0x17d35ba46
371 .quad 0x06cb08e5c,0x11d5ca20e
372 .quad 0x06b749fb2,0x1bf2e8b8a
373 .quad 0x1167f94f2,0x021f3d99c
374 .quad 0x0cec3662e,0x1a3e0968a
375 .quad 0x19329634a,0x08f158014
376 .quad 0x0e6fc4e6a,0x0ce7f39f4
377 .quad 0x08227bb8a,0x1a5e82106
378 .quad 0x0b0cd4768,0x061d82e56
379 .quad 0x13c2b89c4,0x188815ab2
380 .quad 0x0d7a4825c,0x0d270f1a2
381 .quad 0x10f5ff2ba,0x105405f3e
382 .quad 0x00167d312,0x1c3f5f66c
383 .quad 0x0f6076544,0x0e9adf796
384 .quad 0x026f6a60a,0x12ed0daac
385 .quad 0x1a2adb74e,0x096638b34
386 .quad 0x19d34af3a,0x065863b64
387 .quad 0x049c3cc9c,0x1e50585a0
388 .quad 0x068bce87a,0x11eef4f8e
389 .quad 0x1524fa6c6,0x19f1c69dc
390 .quad 0x16cba8aca,0x1ee54f54c
391 .quad 0x042d98888,0x12913343e
392 .quad 0x1329d9f7e,0x0b3e32c28
393 .quad 0x1b1c69528,0x088f25a3a
394 .quad 0x02178513a,0x0064f7f26
395 .quad 0x0e0ac139e,0x04e36f0b0
396 .quad 0x0170076fa,0x0dd7e3b0c
397 .quad 0x141a1a2e2,0x0bd6f81f8
398 .quad 0x16ad828b4,0x0f285651c
399 .quad 0x041d17b64,0x19425cbba
400 .quad 0x1fae1cc66,0x010746f3c
401 .quad 0x1a75b4b00,0x18db37e8a
402 .quad 0x0f872e54c,0x1c24afea4
403 .quad 0x01e41e9fc,0x04c144932
404 .quad 0x086d8e4d2,0x0271d9844
405 .quad 0x160f7af7a,0x052148f02
406 .quad 0x05bb8f1bc,0x08e766a0c
407 .quad 0x0a90fd27a,0x0a3c6f37a
408 .quad 0x0b3af077a,0x093a5f730
409 .quad 0x04984d782,0x1d22c238e
410 .quad 0x0ca6ef3ac,0x06cb08e5c
411 .quad 0x0234e0b26,0x063ded06a
412 .quad 0x1d88abd4a,0x06b749fb2
413 .quad 0x04597456a,0x04d56973c
414 .quad 0x0e9e28eb4,0x1167f94f2
415 .quad 0x07b3ff57a,0x19385bf2e
416 .quad 0x0c9c8b782,0x0cec3662e
417 .quad 0x13a9cba9e,0x0e417f38a
418 .quad 0x093e106a4,0x19329634a
419 .quad 0x167001a9c,0x14e727980
420 .quad 0x1ddffc5d4,0x0e6fc4e6a
421 .quad 0x00df04680,0x0d104b8fc
422 .quad 0x02342001e,0x08227bb8a
423 .quad 0x00a2a8d7e,0x05b397730
424 .quad 0x168763fa6,0x0b0cd4768
425 .quad 0x1ed5a407a,0x0e78eb416
426 .quad 0x0d2c3ed1a,0x13c2b89c4
427 .quad 0x0995a5724,0x1641378f0
428 .quad 0x19b1afbc4,0x0d7a4825c
429 .quad 0x109ffedc0,0x08d96551c
430 .quad 0x0f2271e60,0x10f5ff2ba
431 .quad 0x00b0bf8ca,0x00bf80dd2
432 .quad 0x123888b7a,0x00167d312
433 .quad 0x1e888f7dc,0x18dcddd1c
434 .quad 0x002ee03b2,0x0f6076544
435 .quad 0x183e8d8fe,0x06a45d2b2
436 .quad 0x133d7a042,0x026f6a60a
437 .quad 0x116b0f50c,0x1dd3e10e8
438 .quad 0x05fabe670,0x1a2adb74e
439 .quad 0x130004488,0x0de87806c
440 .quad 0x000bcf5f6,0x19d34af3a
441 .quad 0x18f0c7078,0x014338754
442 .quad 0x017f27698,0x049c3cc9c
443 .quad 0x058ca5f00,0x15e3e77ee
444 .quad 0x1af900c24,0x068bce87a
445 .quad 0x0b5cfca28,0x0dd07448e
446 .quad 0x0ded288f8,0x1524fa6c6
447 .quad 0x059f229bc,0x1d8048348
448 .quad 0x06d390dec,0x16cba8aca
449 .quad 0x037170390,0x0a3e3e02c
450 .quad 0x06353c1cc,0x042d98888
451 .quad 0x0c4584f5c,0x0d73c7bea
452 .quad 0x1f16a3418,0x1329d9f7e
453 .quad 0x0531377e2,0x185137662
454 .quad 0x1d8d9ca7c,0x1b1c69528
455 .quad 0x0b25b29f2,0x18a08b5bc
456 .quad 0x19fb2a8b0,0x02178513a
457 .quad 0x1a08fe6ac,0x1da758ae0
458 .quad 0x045cddf4e,0x0e0ac139e
459 .quad 0x1a91647f2,0x169cf9eb0
460 .quad 0x1a0f717c4,0x0170076fa
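
The remainder path above (less_than_8 / less_than_4 / less_than_2) finishes the buffer with the hardware crc32l/crc32w/crc32b instructions, picking the width from the low bits of the length. A minimal user-space sketch of that same tail using the SSE4.2 intrinsics; the function name and the memcpy-based loads are illustrative only, not taken from the kernel tree:

#include <nmmintrin.h>	/* SSE4.2 _mm_crc32_* intrinsics */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_tail(uint32_t crc, const unsigned char *buf, size_t len)
{
	/* Consume the leftover bytes in the same 4/2/1 order as the
	 * less_than_* labels above. */
	if (len & 4) {
		uint32_t v;
		memcpy(&v, buf, 4);
		crc = _mm_crc32_u32(crc, v);
		buf += 4;
	}
	if (len & 2) {
		uint16_t v;
		memcpy(&v, buf, 2);
		crc = _mm_crc32_u16(crc, v);
		buf += 2;
	}
	if (len & 1)
		crc = _mm_crc32_u8(crc, *buf);
	return crc;
}
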
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S
new file mode 100644
index 000000000000..f7b6ea2ddfdb
--- /dev/null
+++ b/arch/x86/crypto/glue_helper-asm-avx.S
@@ -0,0 +1,91 @@
1/*
2 * Shared glue code for 128bit block ciphers, AVX assembler macros
3 *
4 * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \
19 vmovdqu (0*16)(src), x0; \
20 vmovdqu (1*16)(src), x1; \
21 vmovdqu (2*16)(src), x2; \
22 vmovdqu (3*16)(src), x3; \
23 vmovdqu (4*16)(src), x4; \
24 vmovdqu (5*16)(src), x5; \
25 vmovdqu (6*16)(src), x6; \
26 vmovdqu (7*16)(src), x7;
27
28#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \
29 vmovdqu x0, (0*16)(dst); \
30 vmovdqu x1, (1*16)(dst); \
31 vmovdqu x2, (2*16)(dst); \
32 vmovdqu x3, (3*16)(dst); \
33 vmovdqu x4, (4*16)(dst); \
34 vmovdqu x5, (5*16)(dst); \
35 vmovdqu x6, (6*16)(dst); \
36 vmovdqu x7, (7*16)(dst);
37
38#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
39 vpxor (0*16)(src), x1, x1; \
40 vpxor (1*16)(src), x2, x2; \
41 vpxor (2*16)(src), x3, x3; \
42 vpxor (3*16)(src), x4, x4; \
43 vpxor (4*16)(src), x5, x5; \
44 vpxor (5*16)(src), x6, x6; \
45 vpxor (6*16)(src), x7, x7; \
46 store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
47
48#define inc_le128(x, minus_one, tmp) \
49 vpcmpeqq minus_one, x, tmp; \
50 vpsubq minus_one, x, x; \
51 vpslldq $8, tmp, tmp; \
52 vpsubq tmp, x, x;
53
54#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \
55 vpcmpeqd t0, t0, t0; \
56 vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \
57 vmovdqa bswap, t1; \
58 \
59 /* load IV and byteswap */ \
60 vmovdqu (iv), x7; \
61 vpshufb t1, x7, x0; \
62 \
63 /* construct IVs */ \
64 inc_le128(x7, t0, t2); \
65 vpshufb t1, x7, x1; \
66 inc_le128(x7, t0, t2); \
67 vpshufb t1, x7, x2; \
68 inc_le128(x7, t0, t2); \
69 vpshufb t1, x7, x3; \
70 inc_le128(x7, t0, t2); \
71 vpshufb t1, x7, x4; \
72 inc_le128(x7, t0, t2); \
73 vpshufb t1, x7, x5; \
74 inc_le128(x7, t0, t2); \
75 vpshufb t1, x7, x6; \
76 inc_le128(x7, t0, t2); \
77 vmovdqa x7, t2; \
78 vpshufb t1, x7, x7; \
79 inc_le128(t2, t0, t1); \
80 vmovdqu t2, (iv);
81
82#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \
83 vpxor (0*16)(src), x0, x0; \
84 vpxor (1*16)(src), x1, x1; \
85 vpxor (2*16)(src), x2, x2; \
86 vpxor (3*16)(src), x3, x3; \
87 vpxor (4*16)(src), x4, x4; \
88 vpxor (5*16)(src), x5, x5; \
89 vpxor (6*16)(src), x6, x6; \
90 vpxor (7*16)(src), x7, x7; \
91 store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7);
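
The inc_le128 macro above increments a 128-bit little-endian counter held in an XMM register: vpsubq against the {low: -1, high: 0} constant adds one to the low qword, and the vpcmpeqq/vpslldq pair moves the "low lane was all ones" test into the high lane so the carry can be applied there. A scalar sketch of the same operation (the struct and function names are illustrative, not from the kernel tree):

#include <stdint.h>

struct le128_ctr {
	uint64_t lo;	/* least significant 64 bits */
	uint64_t hi;	/* most significant 64 bits */
};

static void inc_le128_scalar(struct le128_ctr *c)
{
	/* Carry into the high half only when the low half wraps, which is
	 * exactly what the vpcmpeqq/vpslldq/vpsubq sequence computes. */
	if (++c->lo == 0)
		c->hi++;
}
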
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 30b3927bd733..22ce4f683e55 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
221 u8 *src = (u8 *)walk->src.virt.addr; 221 u8 *src = (u8 *)walk->src.virt.addr;
222 u8 *dst = (u8 *)walk->dst.virt.addr; 222 u8 *dst = (u8 *)walk->dst.virt.addr;
223 unsigned int nbytes = walk->nbytes; 223 unsigned int nbytes = walk->nbytes;
224 u128 ctrblk; 224 le128 ctrblk;
225 u128 tmp; 225 u128 tmp;
226 226
227 be128_to_u128(&ctrblk, (be128 *)walk->iv); 227 be128_to_le128(&ctrblk, (be128 *)walk->iv);
228 228
229 memcpy(&tmp, src, nbytes); 229 memcpy(&tmp, src, nbytes);
230 fn_ctr(ctx, &tmp, &tmp, &ctrblk); 230 fn_ctr(ctx, &tmp, &tmp, &ctrblk);
231 memcpy(dst, &tmp, nbytes); 231 memcpy(dst, &tmp, nbytes);
232 232
233 u128_to_be128((be128 *)walk->iv, &ctrblk); 233 le128_to_be128((be128 *)walk->iv, &ctrblk);
234} 234}
235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); 235EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
236 236
@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
243 unsigned int nbytes = walk->nbytes; 243 unsigned int nbytes = walk->nbytes;
244 u128 *src = (u128 *)walk->src.virt.addr; 244 u128 *src = (u128 *)walk->src.virt.addr;
245 u128 *dst = (u128 *)walk->dst.virt.addr; 245 u128 *dst = (u128 *)walk->dst.virt.addr;
246 u128 ctrblk; 246 le128 ctrblk;
247 unsigned int num_blocks, func_bytes; 247 unsigned int num_blocks, func_bytes;
248 unsigned int i; 248 unsigned int i;
249 249
250 be128_to_u128(&ctrblk, (be128 *)walk->iv); 250 be128_to_le128(&ctrblk, (be128 *)walk->iv);
251 251
252 /* Process multi-block batch */ 252 /* Process multi-block batch */
253 for (i = 0; i < gctx->num_funcs; i++) { 253 for (i = 0; i < gctx->num_funcs; i++) {
@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
269 } 269 }
270 270
271done: 271done:
272 u128_to_be128((be128 *)walk->iv, &ctrblk); 272 le128_to_be128((be128 *)walk->iv, &ctrblk);
273 return nbytes; 273 return nbytes;
274} 274}
275 275
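
glue_ctr_crypt_final_128bit above handles a trailing partial block by widening it into a full 16-byte buffer, running the single-block CTR function on that buffer in place, and copying only the valid bytes back out. A stand-alone sketch of the same pattern; the 3-argument ctr_fn_t shape and all names here are assumptions for illustration (the kernel's callback also carries the IV separately):

#include <stdint.h>
#include <string.h>

/* Assumed to encrypt the current counter block, XOR the keystream into
 * the 16-byte buffer, and advance the counter. */
typedef void (*ctr_fn_t)(void *ctx, uint8_t buf[16], uint8_t ctr[16]);

static void ctr_final_partial_block(void *ctx, ctr_fn_t fn_ctr,
				    uint8_t *dst, const uint8_t *src,
				    size_t nbytes, uint8_t ctr[16])
{
	uint8_t tmp[16];

	/* Widen the partial block; the unused keystream bytes are simply
	 * never copied back. */
	memcpy(tmp, src, nbytes);
	fn_ctr(ctx, tmp, ctr);
	memcpy(dst, tmp, nbytes);
}
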
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
index 504106bf04a2..02b0e9fe997c 100644
--- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S
@@ -24,7 +24,16 @@
24 * 24 *
25 */ 25 */
26 26
27#include "glue_helper-asm-avx.S"
28
27.file "serpent-avx-x86_64-asm_64.S" 29.file "serpent-avx-x86_64-asm_64.S"
30
31.data
32.align 16
33
34.Lbswap128_mask:
35 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
36
28.text 37.text
29 38
30#define CTX %rdi 39#define CTX %rdi
@@ -550,51 +559,27 @@
550 vpunpcklqdq x3, t2, x2; \ 559 vpunpcklqdq x3, t2, x2; \
551 vpunpckhqdq x3, t2, x3; 560 vpunpckhqdq x3, t2, x3;
552 561
553#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ 562#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
554 vmovdqu (0*4*4)(in), x0; \
555 vmovdqu (1*4*4)(in), x1; \
556 vmovdqu (2*4*4)(in), x2; \
557 vmovdqu (3*4*4)(in), x3; \
558 \
559 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 563 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
560 564
561#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ 565#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
562 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 566 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
563 \
564 vmovdqu x0, (0*4*4)(out); \
565 vmovdqu x1, (1*4*4)(out); \
566 vmovdqu x2, (2*4*4)(out); \
567 vmovdqu x3, (3*4*4)(out);
568
569#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
570 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
571 \
572 vpxor (0*4*4)(out), x0, x0; \
573 vmovdqu x0, (0*4*4)(out); \
574 vpxor (1*4*4)(out), x1, x1; \
575 vmovdqu x1, (1*4*4)(out); \
576 vpxor (2*4*4)(out), x2, x2; \
577 vmovdqu x2, (2*4*4)(out); \
578 vpxor (3*4*4)(out), x3, x3; \
579 vmovdqu x3, (3*4*4)(out);
580 567
581.align 8 568.align 8
582.global __serpent_enc_blk_8way_avx 569.type __serpent_enc_blk8_avx,@function;
583.type __serpent_enc_blk_8way_avx,@function;
584 570
585__serpent_enc_blk_8way_avx: 571__serpent_enc_blk8_avx:
586 /* input: 572 /* input:
587 * %rdi: ctx, CTX 573 * %rdi: ctx, CTX
588 * %rsi: dst 574 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
589 * %rdx: src 575 * output:
590 * %rcx: bool, if true: xor output 576 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
591 */ 577 */
592 578
593 vpcmpeqd RNOT, RNOT, RNOT; 579 vpcmpeqd RNOT, RNOT, RNOT;
594 580
595 leaq (4*4*4)(%rdx), %rax; 581 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
596 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); 582 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
597 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
598 583
599 K2(RA, RB, RC, RD, RE, 0); 584 K2(RA, RB, RC, RD, RE, 0);
600 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); 585 S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx:
630 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); 615 S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
631 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); 616 S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);
632 617
633 leaq (4*4*4)(%rsi), %rax; 618 write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
634 619 write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
635 testb %cl, %cl;
636 jnz __enc_xor8;
637
638 write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
639 write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
640
641 ret;
642
643__enc_xor8:
644 xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
645 xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
646 620
647 ret; 621 ret;
648 622
649.align 8 623.align 8
650.global serpent_dec_blk_8way_avx 624.type __serpent_dec_blk8_avx,@function;
651.type serpent_dec_blk_8way_avx,@function;
652 625
653serpent_dec_blk_8way_avx: 626__serpent_dec_blk8_avx:
654 /* input: 627 /* input:
655 * %rdi: ctx, CTX 628 * %rdi: ctx, CTX
656 * %rsi: dst 629 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
657 * %rdx: src 630 * output:
631 * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
658 */ 632 */
659 633
660 vpcmpeqd RNOT, RNOT, RNOT; 634 vpcmpeqd RNOT, RNOT, RNOT;
661 635
662 leaq (4*4*4)(%rdx), %rax; 636 read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
663 read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); 637 read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
664 read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
665 638
666 K2(RA, RB, RC, RD, RE, 32); 639 K2(RA, RB, RC, RD, RE, 32);
667 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); 640 SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx:
697 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); 670 SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
698 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); 671 S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);
699 672
700 leaq (4*4*4)(%rsi), %rax; 673 write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
701 write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); 674 write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
702 write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); 675
676 ret;
677
678.align 8
679.global serpent_ecb_enc_8way_avx
680.type serpent_ecb_enc_8way_avx,@function;
681
682serpent_ecb_enc_8way_avx:
683 /* input:
684 * %rdi: ctx, CTX
685 * %rsi: dst
686 * %rdx: src
687 */
688
689 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
690
691 call __serpent_enc_blk8_avx;
692
693 store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
694
695 ret;
696
697.align 8
698.global serpent_ecb_dec_8way_avx
699.type serpent_ecb_dec_8way_avx,@function;
700
701serpent_ecb_dec_8way_avx:
702 /* input:
703 * %rdi: ctx, CTX
704 * %rsi: dst
705 * %rdx: src
706 */
707
708 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
709
710 call __serpent_dec_blk8_avx;
711
712 store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
713
714 ret;
715
716.align 8
717.global serpent_cbc_dec_8way_avx
718.type serpent_cbc_dec_8way_avx,@function;
719
720serpent_cbc_dec_8way_avx:
721 /* input:
722 * %rdi: ctx, CTX
723 * %rsi: dst
724 * %rdx: src
725 */
726
727 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
728
729 call __serpent_dec_blk8_avx;
730
731 store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
732
733 ret;
734
735.align 8
736.global serpent_ctr_8way_avx
737.type serpent_ctr_8way_avx,@function;
738
739serpent_ctr_8way_avx:
740 /* input:
741 * %rdi: ctx, CTX
742 * %rsi: dst
743 * %rdx: src
744 * %rcx: iv (little endian, 128bit)
745 */
746
747 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
748 RD2, RK0, RK1, RK2);
749
750 call __serpent_enc_blk8_avx;
751
752 store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
703 753
704 ret; 754 ret;
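
serpent_cbc_dec_8way_avx above decrypts eight blocks and then uses store_cbc_8way to XOR ciphertext block i into decrypted block i+1, which is the chaining step of CBC decryption; the IV XOR for the very first block is left to the generic glue code. A plain C sketch of that chaining step (illustrative only, not kernel code), assuming dst holds the raw block decryptions and src still holds the ciphertext:

#include <stddef.h>
#include <stdint.h>

static void cbc_dec_chain(uint8_t *dst, const uint8_t *src, size_t nblocks)
{
	/* Every block except the first gets XORed with the previous
	 * ciphertext block, mirroring store_cbc_8way. */
	for (size_t i = 1; i < nblocks; i++)
		for (size_t j = 0; j < 16; j++)
			dst[i * 16 + j] ^= src[(i - 1) * 16 + j];
}
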
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 3f543a04cf1e..52abaaf28e7f 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -42,55 +42,24 @@
42#include <asm/crypto/ablk_helper.h> 42#include <asm/crypto/ablk_helper.h>
43#include <asm/crypto/glue_helper.h> 43#include <asm/crypto/glue_helper.h>
44 44
45static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) 45static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
46{
47 u128 ivs[SERPENT_PARALLEL_BLOCKS - 1];
48 unsigned int j;
49
50 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
51 ivs[j] = src[j];
52
53 serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
54
55 for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++)
56 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
57}
58
59static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
60{ 46{
61 be128 ctrblk; 47 be128 ctrblk;
62 48
63 u128_to_be128(&ctrblk, iv); 49 le128_to_be128(&ctrblk, iv);
64 u128_inc(iv); 50 le128_inc(iv);
65 51
66 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 52 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
67 u128_xor(dst, src, (u128 *)&ctrblk); 53 u128_xor(dst, src, (u128 *)&ctrblk);
68} 54}
69 55
70static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
71 u128 *iv)
72{
73 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
74 unsigned int i;
75
76 for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) {
77 if (dst != src)
78 dst[i] = src[i];
79
80 u128_to_be128(&ctrblks[i], iv);
81 u128_inc(iv);
82 }
83
84 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
85}
86
87static const struct common_glue_ctx serpent_enc = { 56static const struct common_glue_ctx serpent_enc = {
88 .num_funcs = 2, 57 .num_funcs = 2,
89 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, 58 .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS,
90 59
91 .funcs = { { 60 .funcs = { {
92 .num_blocks = SERPENT_PARALLEL_BLOCKS, 61 .num_blocks = SERPENT_PARALLEL_BLOCKS,
93 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } 62 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) }
94 }, { 63 }, {
95 .num_blocks = 1, 64 .num_blocks = 1,
96 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } 65 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) }
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = {
103 72
104 .funcs = { { 73 .funcs = { {
105 .num_blocks = SERPENT_PARALLEL_BLOCKS, 74 .num_blocks = SERPENT_PARALLEL_BLOCKS,
106 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } 75 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) }
107 }, { 76 }, {
108 .num_blocks = 1, 77 .num_blocks = 1,
109 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } 78 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) }
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = {
116 85
117 .funcs = { { 86 .funcs = { {
118 .num_blocks = SERPENT_PARALLEL_BLOCKS, 87 .num_blocks = SERPENT_PARALLEL_BLOCKS,
119 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } 88 .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) }
120 }, { 89 }, {
121 .num_blocks = 1, 90 .num_blocks = 1,
122 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } 91 .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) }
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = {
129 98
130 .funcs = { { 99 .funcs = { {
131 .num_blocks = SERPENT_PARALLEL_BLOCKS, 100 .num_blocks = SERPENT_PARALLEL_BLOCKS,
132 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } 101 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) }
133 }, { 102 }, {
134 .num_blocks = 1, 103 .num_blocks = 1,
135 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } 104 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) }
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
193 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); 162 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
194 163
195 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { 164 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
196 serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); 165 serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst);
197 return; 166 return;
198 } 167 }
199 168
@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
210 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); 179 ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes);
211 180
212 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { 181 if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) {
213 serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); 182 serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst);
214 return; 183 return;
215 } 184 }
216 185
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index 9107a9908c41..97a356ece24d 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src)
59 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); 59 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
60} 60}
61 61
62static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 62static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
63{ 63{
64 be128 ctrblk; 64 be128 ctrblk;
65 65
66 u128_to_be128(&ctrblk, iv); 66 le128_to_be128(&ctrblk, iv);
67 u128_inc(iv); 67 le128_inc(iv);
68 68
69 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 69 __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
70 u128_xor(dst, src, (u128 *)&ctrblk); 70 u128_xor(dst, src, (u128 *)&ctrblk);
71} 71}
72 72
73static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, 73static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
74 u128 *iv) 74 le128 *iv)
75{ 75{
76 be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; 76 be128 ctrblks[SERPENT_PARALLEL_BLOCKS];
77 unsigned int i; 77 unsigned int i;
@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src,
80 if (dst != src) 80 if (dst != src)
81 dst[i] = src[i]; 81 dst[i] = src[i];
82 82
83 u128_to_be128(&ctrblks[i], iv); 83 le128_to_be128(&ctrblks[i], iv);
84 u128_inc(iv); 84 le128_inc(iv);
85 } 85 }
86 86
87 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); 87 serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 1585abb13dde..ebac16bfa830 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -23,7 +23,16 @@
23 * 23 *
24 */ 24 */
25 25
26#include "glue_helper-asm-avx.S"
27
26.file "twofish-avx-x86_64-asm_64.S" 28.file "twofish-avx-x86_64-asm_64.S"
29
30.data
31.align 16
32
33.Lbswap128_mask:
34 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
35
27.text 36.text
28 37
29/* structure of crypto context */ 38/* structure of crypto context */
@@ -217,69 +226,45 @@
217 vpunpcklqdq x3, t2, x2; \ 226 vpunpcklqdq x3, t2, x2; \
218 vpunpckhqdq x3, t2, x3; 227 vpunpckhqdq x3, t2, x3;
219 228
220#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ 229#define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
221 vpxor (0*4*4)(in), wkey, x0; \ 230 vpxor x0, wkey, x0; \
222 vpxor (1*4*4)(in), wkey, x1; \ 231 vpxor x1, wkey, x1; \
223 vpxor (2*4*4)(in), wkey, x2; \ 232 vpxor x2, wkey, x2; \
224 vpxor (3*4*4)(in), wkey, x3; \ 233 vpxor x3, wkey, x3; \
225 \ 234 \
226 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 235 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
227 236
228#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ 237#define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \
229 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
230 \
231 vpxor x0, wkey, x0; \
232 vmovdqu x0, (0*4*4)(out); \
233 vpxor x1, wkey, x1; \
234 vmovdqu x1, (1*4*4)(out); \
235 vpxor x2, wkey, x2; \
236 vmovdqu x2, (2*4*4)(out); \
237 vpxor x3, wkey, x3; \
238 vmovdqu x3, (3*4*4)(out);
239
240#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
241 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 238 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
242 \ 239 \
243 vpxor x0, wkey, x0; \ 240 vpxor x0, wkey, x0; \
244 vpxor (0*4*4)(out), x0, x0; \ 241 vpxor x1, wkey, x1; \
245 vmovdqu x0, (0*4*4)(out); \ 242 vpxor x2, wkey, x2; \
246 vpxor x1, wkey, x1; \ 243 vpxor x3, wkey, x3;
247 vpxor (1*4*4)(out), x1, x1; \
248 vmovdqu x1, (1*4*4)(out); \
249 vpxor x2, wkey, x2; \
250 vpxor (2*4*4)(out), x2, x2; \
251 vmovdqu x2, (2*4*4)(out); \
252 vpxor x3, wkey, x3; \
253 vpxor (3*4*4)(out), x3, x3; \
254 vmovdqu x3, (3*4*4)(out);
255 244
256.align 8 245.align 8
257.global __twofish_enc_blk_8way 246.type __twofish_enc_blk8,@function;
258.type __twofish_enc_blk_8way,@function;
259 247
260__twofish_enc_blk_8way: 248__twofish_enc_blk8:
261 /* input: 249 /* input:
262 * %rdi: ctx, CTX 250 * %rdi: ctx, CTX
263 * %rsi: dst 251 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
264 * %rdx: src 252 * output:
265 * %rcx: bool, if true: xor output 253 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
266 */ 254 */
267 255
256 vmovdqu w(CTX), RK1;
257
268 pushq %rbp; 258 pushq %rbp;
269 pushq %rbx; 259 pushq %rbx;
270 pushq %rcx; 260 pushq %rcx;
271 261
272 vmovdqu w(CTX), RK1; 262 inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
273
274 leaq (4*4*4)(%rdx), %rax;
275 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
276 preload_rgi(RA1); 263 preload_rgi(RA1);
277 rotate_1l(RD1); 264 rotate_1l(RD1);
278 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); 265 inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
279 rotate_1l(RD2); 266 rotate_1l(RD2);
280 267
281 movq %rsi, %r11;
282
283 encrypt_cycle(0); 268 encrypt_cycle(0);
284 encrypt_cycle(1); 269 encrypt_cycle(1);
285 encrypt_cycle(2); 270 encrypt_cycle(2);
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way:
295 popq %rbx; 280 popq %rbx;
296 popq %rbp; 281 popq %rbp;
297 282
298 leaq (4*4*4)(%r11), %rax; 283 outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
299 284 outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
300 testb %cl, %cl;
301 jnz __enc_xor8;
302
303 outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
304 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
305
306 ret;
307
308__enc_xor8:
309 outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
310 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
311 285
312 ret; 286 ret;
313 287
314.align 8 288.align 8
315.global twofish_dec_blk_8way 289.type __twofish_dec_blk8,@function;
316.type twofish_dec_blk_8way,@function;
317 290
318twofish_dec_blk_8way: 291__twofish_dec_blk8:
319 /* input: 292 /* input:
320 * %rdi: ctx, CTX 293 * %rdi: ctx, CTX
321 * %rsi: dst 294 * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks
322 * %rdx: src 295 * output:
296 * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
323 */ 297 */
324 298
299 vmovdqu (w+4*4)(CTX), RK1;
300
325 pushq %rbp; 301 pushq %rbp;
326 pushq %rbx; 302 pushq %rbx;
327 303
328 vmovdqu (w+4*4)(CTX), RK1; 304 inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
329
330 leaq (4*4*4)(%rdx), %rax;
331 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
332 preload_rgi(RC1); 305 preload_rgi(RC1);
333 rotate_1l(RA1); 306 rotate_1l(RA1);
334 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); 307 inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
335 rotate_1l(RA2); 308 rotate_1l(RA2);
336 309
337 movq %rsi, %r11;
338
339 decrypt_cycle(7); 310 decrypt_cycle(7);
340 decrypt_cycle(6); 311 decrypt_cycle(6);
341 decrypt_cycle(5); 312 decrypt_cycle(5);
@@ -350,8 +321,103 @@ twofish_dec_blk_8way:
350 popq %rbx; 321 popq %rbx;
351 popq %rbp; 322 popq %rbp;
352 323
353 leaq (4*4*4)(%r11), %rax; 324 outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
354 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); 325 outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
355 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); 326
327 ret;
328
329.align 8
330.global twofish_ecb_enc_8way
331.type twofish_ecb_enc_8way,@function;
332
333twofish_ecb_enc_8way:
334 /* input:
335 * %rdi: ctx, CTX
336 * %rsi: dst
337 * %rdx: src
338 */
339
340 movq %rsi, %r11;
341
342 load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
343
344 call __twofish_enc_blk8;
345
346 store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
347
348 ret;
349
350.align 8
351.global twofish_ecb_dec_8way
352.type twofish_ecb_dec_8way,@function;
353
354twofish_ecb_dec_8way:
355 /* input:
356 * %rdi: ctx, CTX
357 * %rsi: dst
358 * %rdx: src
359 */
360
361 movq %rsi, %r11;
362
363 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
364
365 call __twofish_dec_blk8;
366
367 store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
368
369 ret;
370
371.align 8
372.global twofish_cbc_dec_8way
373.type twofish_cbc_dec_8way,@function;
374
375twofish_cbc_dec_8way:
376 /* input:
377 * %rdi: ctx, CTX
378 * %rsi: dst
379 * %rdx: src
380 */
381
382 pushq %r12;
383
384 movq %rsi, %r11;
385 movq %rdx, %r12;
386
387 load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
388
389 call __twofish_dec_blk8;
390
391 store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
392
393 popq %r12;
394
395 ret;
396
397.align 8
398.global twofish_ctr_8way
399.type twofish_ctr_8way,@function;
400
401twofish_ctr_8way:
402 /* input:
403 * %rdi: ctx, CTX
404 * %rsi: dst
405 * %rdx: src
406 * %rcx: iv (little endian, 128bit)
407 */
408
409 pushq %r12;
410
411 movq %rsi, %r11;
412 movq %rdx, %r12;
413
414 load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
415 RD2, RX0, RX1, RY0);
416
417 call __twofish_enc_blk8;
418
419 store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2);
420
421 popq %r12;
356 422
357 ret; 423 ret;
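
twofish_ctr_8way above (like the serpent version) keeps the running counter in little-endian form and uses vpshufb with .Lbswap128_mask, whose bytes run 15 down to 0, to produce the big-endian counter blocks that the cipher actually encrypts. That mask is a full 16-byte reversal; a byte-level sketch of the permutation (illustrative only):

#include <stdint.h>

static void bswap128(uint8_t out[16], const uint8_t in[16])
{
	/* Same permutation as vpshufb with the .Lbswap128_mask above:
	 * destination byte i takes source byte 15 - i. */
	for (int i = 0; i < 16; i++)
		out[i] = in[15 - i];
}
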
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index e7708b5442e0..94ac91d26e47 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -45,66 +45,23 @@
45 45
46#define TWOFISH_PARALLEL_BLOCKS 8 46#define TWOFISH_PARALLEL_BLOCKS 8
47 47
48static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
49 const u8 *src)
50{
51 __twofish_enc_blk_3way(ctx, dst, src, false);
52}
53
54/* 8-way parallel cipher functions */ 48/* 8-way parallel cipher functions */
55asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, 49asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst,
56 const u8 *src, bool xor); 50 const u8 *src);
57asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, 51asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst,
58 const u8 *src); 52 const u8 *src);
59 53
60static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, 54asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst,
61 const u8 *src) 55 const u8 *src);
62{ 56asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst,
63 __twofish_enc_blk_8way(ctx, dst, src, false); 57 const u8 *src, le128 *iv);
64}
65
66static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst,
67 const u8 *src)
68{
69 __twofish_enc_blk_8way(ctx, dst, src, true);
70}
71 58
72static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, 59static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst,
73 const u8 *src) 60 const u8 *src)
74{ 61{
75 twofish_dec_blk_8way(ctx, dst, src); 62 __twofish_enc_blk_3way(ctx, dst, src, false);
76}
77
78static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src)
79{
80 u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1];
81 unsigned int j;
82
83 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
84 ivs[j] = src[j];
85
86 twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src);
87
88 for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++)
89 u128_xor(dst + (j + 1), dst + (j + 1), ivs + j);
90} 63}
91 64
92static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src,
93 u128 *iv)
94{
95 be128 ctrblks[TWOFISH_PARALLEL_BLOCKS];
96 unsigned int i;
97
98 for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) {
99 if (dst != src)
100 dst[i] = src[i];
101
102 u128_to_be128(&ctrblks[i], iv);
103 u128_inc(iv);
104 }
105
106 twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks);
107}
108 65
109static const struct common_glue_ctx twofish_enc = { 66static const struct common_glue_ctx twofish_enc = {
110 .num_funcs = 3, 67 .num_funcs = 3,
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = {
112 69
113 .funcs = { { 70 .funcs = { {
114 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 71 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
115 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } 72 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) }
116 }, { 73 }, {
117 .num_blocks = 3, 74 .num_blocks = 3,
118 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } 75 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) }
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = {
128 85
129 .funcs = { { 86 .funcs = { {
130 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 87 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
131 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } 88 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) }
132 }, { 89 }, {
133 .num_blocks = 3, 90 .num_blocks = 3,
134 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } 91 .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) }
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = {
144 101
145 .funcs = { { 102 .funcs = { {
146 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 103 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
147 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } 104 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) }
148 }, { 105 }, {
149 .num_blocks = 3, 106 .num_blocks = 3,
150 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } 107 .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) }
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = {
160 117
161 .funcs = { { 118 .funcs = { {
162 .num_blocks = TWOFISH_PARALLEL_BLOCKS, 119 .num_blocks = TWOFISH_PARALLEL_BLOCKS,
163 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } 120 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) }
164 }, { 121 }, {
165 .num_blocks = 3, 122 .num_blocks = 3,
166 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } 123 .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) }
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
227 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 184 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
228 185
229 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { 186 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
230 twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); 187 twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst);
231 return; 188 return;
232 } 189 }
233 190
@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
249 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); 206 ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes);
250 207
251 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { 208 if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) {
252 twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); 209 twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst);
253 return; 210 return;
254 } 211 }
255 212
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c
index aa3eb358b7e8..13e63b3e1dfb 100644
--- a/arch/x86/crypto/twofish_glue_3way.c
+++ b/arch/x86/crypto/twofish_glue_3way.c
@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src)
62} 62}
63EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); 63EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way);
64 64
65void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) 65void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv)
66{ 66{
67 be128 ctrblk; 67 be128 ctrblk;
68 68
69 if (dst != src) 69 if (dst != src)
70 *dst = *src; 70 *dst = *src;
71 71
72 u128_to_be128(&ctrblk, iv); 72 le128_to_be128(&ctrblk, iv);
73 u128_inc(iv); 73 le128_inc(iv);
74 74
75 twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); 75 twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk);
76 u128_xor(dst, dst, (u128 *)&ctrblk); 76 u128_xor(dst, dst, (u128 *)&ctrblk);
@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv)
78EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); 78EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr);
79 79
80void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, 80void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
81 u128 *iv) 81 le128 *iv)
82{ 82{
83 be128 ctrblks[3]; 83 be128 ctrblks[3];
84 84
@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
88 dst[2] = src[2]; 88 dst[2] = src[2];
89 } 89 }
90 90
91 u128_to_be128(&ctrblks[0], iv); 91 le128_to_be128(&ctrblks[0], iv);
92 u128_inc(iv); 92 le128_inc(iv);
93 u128_to_be128(&ctrblks[1], iv); 93 le128_to_be128(&ctrblks[1], iv);
94 u128_inc(iv); 94 le128_inc(iv);
95 u128_to_be128(&ctrblks[2], iv); 95 le128_to_be128(&ctrblks[2], iv);
96 u128_inc(iv); 96 le128_inc(iv);
97 97
98 twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); 98 twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
99} 99}
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h
new file mode 100644
index 000000000000..98038add801e
--- /dev/null
+++ b/arch/x86/include/asm/crypto/camellia.h
@@ -0,0 +1,82 @@
1#ifndef ASM_X86_CAMELLIA_H
2#define ASM_X86_CAMELLIA_H
3
4#include <linux/kernel.h>
5#include <linux/crypto.h>
6
7#define CAMELLIA_MIN_KEY_SIZE 16
8#define CAMELLIA_MAX_KEY_SIZE 32
9#define CAMELLIA_BLOCK_SIZE 16
10#define CAMELLIA_TABLE_BYTE_LEN 272
11#define CAMELLIA_PARALLEL_BLOCKS 2
12
13struct camellia_ctx {
14 u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
15 u32 key_length;
16};
17
18struct camellia_lrw_ctx {
19 struct lrw_table_ctx lrw_table;
20 struct camellia_ctx camellia_ctx;
21};
22
23struct camellia_xts_ctx {
24 struct camellia_ctx tweak_ctx;
25 struct camellia_ctx crypt_ctx;
26};
27
28extern int __camellia_setkey(struct camellia_ctx *cctx,
29 const unsigned char *key,
30 unsigned int key_len, u32 *flags);
31
32extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
33 unsigned int keylen);
34extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm);
35
36extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key,
37 unsigned int keylen);
38
39/* regular block cipher functions */
40asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
41 const u8 *src, bool xor);
42asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst,
43 const u8 *src);
44
45/* 2-way parallel cipher functions */
46asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
47 const u8 *src, bool xor);
48asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst,
49 const u8 *src);
50
51static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
52 const u8 *src)
53{
54 __camellia_enc_blk(ctx, dst, src, false);
55}
56
57static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
58 const u8 *src)
59{
60 __camellia_enc_blk(ctx, dst, src, true);
61}
62
63static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
64 const u8 *src)
65{
66 __camellia_enc_blk_2way(ctx, dst, src, false);
67}
68
69static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst,
70 const u8 *src)
71{
72 __camellia_enc_blk_2way(ctx, dst, src, true);
73}
74
75/* glue helpers */
76extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src);
77extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src,
78 le128 *iv);
79extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src,
80 le128 *iv);
81
82#endif /* ASM_X86_CAMELLIA_H */
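
camellia_enc_blk_xor_2way above selects the assembler path that XORs the encryption result into dst instead of overwriting it, which is what the 2-way CTR helper builds on: dst is pre-filled with the plaintext, two consecutive counter blocks are encrypted, and the xor variant folds them in. A sketch of that usage, mirroring the xway CTR helpers shown elsewhere in this diff; the function name is illustrative (camellia_crypt_ctr_2way declared above is the real one), and it assumes this header plus <asm/crypto/glue_helper.h> for the le128 helpers:

static void camellia_ctr_2way_sketch(void *ctx, u128 *dst, const u128 *src,
				     le128 *iv)
{
	be128 ctrblks[2];

	/* CTR mode: dst must already contain the plaintext when the xor
	 * variant is used to combine in the encrypted counter blocks. */
	if (dst != src) {
		dst[0] = src[0];
		dst[1] = src[1];
	}

	le128_to_be128(&ctrblks[0], iv);
	le128_inc(iv);
	le128_to_be128(&ctrblks[1], iv);
	le128_inc(iv);

	camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks);
}
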
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h
index 3e408bddc96f..e2d65b061d27 100644
--- a/arch/x86/include/asm/crypto/glue_helper.h
+++ b/arch/x86/include/asm/crypto/glue_helper.h
@@ -13,7 +13,7 @@
13typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); 13typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src);
14typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); 14typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src);
15typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, 15typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src,
16 u128 *iv); 16 le128 *iv);
17 17
18#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) 18#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn))
19#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) 19#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn))
@@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled)
71 kernel_fpu_end(); 71 kernel_fpu_end();
72} 72}
73 73
74static inline void u128_to_be128(be128 *dst, const u128 *src) 74static inline void le128_to_be128(be128 *dst, const le128 *src)
75{ 75{
76 dst->a = cpu_to_be64(src->a); 76 dst->a = cpu_to_be64(le64_to_cpu(src->a));
77 dst->b = cpu_to_be64(src->b); 77 dst->b = cpu_to_be64(le64_to_cpu(src->b));
78} 78}
79 79
80static inline void be128_to_u128(u128 *dst, const be128 *src) 80static inline void be128_to_le128(le128 *dst, const be128 *src)
81{ 81{
82 dst->a = be64_to_cpu(src->a); 82 dst->a = cpu_to_le64(be64_to_cpu(src->a));
83 dst->b = be64_to_cpu(src->b); 83 dst->b = cpu_to_le64(be64_to_cpu(src->b));
84} 84}
85 85
86static inline void u128_inc(u128 *i) 86static inline void le128_inc(le128 *i)
87{ 87{
88 i->b++; 88 u64 a = le64_to_cpu(i->a);
89 if (!i->b) 89 u64 b = le64_to_cpu(i->b);
90 i->a++; 90
91 b++;
92 if (!b)
93 a++;
94
95 i->a = cpu_to_le64(a);
96 i->b = cpu_to_le64(b);
91} 97}
92 98
93extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, 99extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx,
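
The rewritten helpers above keep the CTR counter in explicit little-endian memory order, so le128_inc converts to CPU order, increments with carry, and converts back; the behaviour is therefore the same on little- and big-endian builds. A small usage sketch exercising the wrap-around case (not part of the patch; it assumes the le128/be128 definitions from the generic crypto headers and reads member .a as the high half, .b as the low half, as the helpers above do):

static void le128_inc_wrap_sketch(void)
{
	le128 iv;
	be128 ctrblk;

	iv.a = cpu_to_le64(0);		/* high 64 bits */
	iv.b = cpu_to_le64(~0ULL);	/* low 64 bits, about to wrap */

	le128_inc(&iv);			/* .b wraps to 0, carry into .a */
	le128_to_be128(&ctrblk, &iv);	/* big-endian block fed to the cipher */
}
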
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h
index 432deedd2945..0da1d3e2a55c 100644
--- a/arch/x86/include/asm/crypto/serpent-avx.h
+++ b/arch/x86/include/asm/crypto/serpent-avx.h
@@ -6,27 +6,14 @@
6 6
7#define SERPENT_PARALLEL_BLOCKS 8 7#define SERPENT_PARALLEL_BLOCKS 8
8 8
9asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, 9asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst,
10 const u8 *src, bool xor); 10 const u8 *src);
11asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, 11asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
12 const u8 *src); 12 const u8 *src);
13 13
14static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, 14asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst,
15 const u8 *src) 15 const u8 *src);
16{ 16asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst,
17 __serpent_enc_blk_8way_avx(ctx, dst, src, false); 17 const u8 *src, le128 *iv);
18}
19
20static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
21 const u8 *src)
22{
23 __serpent_enc_blk_8way_avx(ctx, dst, src, true);
24}
25
26static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
27 const u8 *src)
28{
29 serpent_dec_blk_8way_avx(ctx, dst, src);
30}
31 18
32#endif 19#endif
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h
index 9d2c514bd5f9..878c51ceebb5 100644
--- a/arch/x86/include/asm/crypto/twofish.h
+++ b/arch/x86/include/asm/crypto/twofish.h
@@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst,
31/* helpers from twofish_x86_64-3way module */ 31/* helpers from twofish_x86_64-3way module */
32extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); 32extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src);
33extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, 33extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src,
34 u128 *iv); 34 le128 *iv);
35extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, 35extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src,
36 u128 *iv); 36 le128 *iv);
37 37
38extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, 38extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key,
39 unsigned int keylen); 39 unsigned int keylen);