author	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-15 15:35:19 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-15 15:35:19 -0500
commit	1ed55eac3b1fc30b29cdb52251e0f13b24fc344c (patch)
tree	b7a4c67f2e29f8aa418708c5da871e64c511f3ff /arch/x86
parent	08242bc2210938761230f79c5288dbcf72e94808 (diff)
parent	a2c0911c09190125f52c9941b9d187f601c2f7be (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6
Pull crypto update from Herbert Xu:
- Added aesni/avx/x86_64 implementations for camellia.
- Optimised AVX code for cast5/serpent/twofish/cast6.
- Fixed vmac bug with unaligned input.
- Allow compression algorithms in FIPS mode.
- Optimised crc32c implementation for Intel.
- Misc fixes.
* git://git.kernel.org/pub/scm/linux/kernel/git/herbert/crypto-2.6: (32 commits)
crypto: caam - Updated SEC-4.0 device tree binding for ERA information.
crypto: testmgr - remove superfluous initializers for xts(aes)
crypto: testmgr - allow compression algs in fips mode
crypto: testmgr - add larger crc32c test vector to test FPU path in crc32c_intel
crypto: testmgr - clean alg_test_null entries in alg_test_descs[]
crypto: testmgr - remove fips_allowed flag from camellia-aesni null-tests
crypto: cast5/cast6 - move lookup tables to shared module
padata: use __this_cpu_read per-cpu helper
crypto: s5p-sss - Fix compilation error
crypto: picoxcell - Add terminating entry for platform_device_id table
crypto: omap-aes - select BLKCIPHER2
crypto: camellia - add AES-NI/AVX/x86_64 assembler implementation of camellia cipher
crypto: camellia-x86_64 - share common functions and move structures and function definitions to header file
crypto: tcrypt - add async speed test for camellia cipher
crypto: tegra-aes - fix error-valued pointer dereference
crypto: tegra - fix missing unlock on error case
crypto: cast5/avx - avoid using temporary stack buffers
crypto: serpent/avx - avoid using temporary stack buffers
crypto: twofish/avx - avoid using temporary stack buffers
crypto: cast6/avx - avoid using temporary stack buffers
...
Diffstat (limited to 'arch/x86')
22 files changed, 3160 insertions(+), 620 deletions(-)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 5bacb4a226ac..e0ca7c9ac383 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
+obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += camellia-aesni-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST5_AVX_X86_64) += cast5-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_CAST6_AVX_X86_64) += cast6-avx-x86_64.o
 obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -34,6 +35,8 @@ serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
+camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
+				camellia_aesni_avx_glue.o
 cast5-avx-x86_64-y := cast5-avx-x86_64-asm_64.o cast5_avx_glue.o
 cast6-avx-x86_64-y := cast6-avx-x86_64-asm_64.o cast6_avx_glue.o
 blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
@@ -47,3 +50,5 @@ serpent-avx-x86_64-y := serpent-avx-x86_64-asm_64.o serpent_avx_glue.o
 aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+crc32c-intel-y := crc32c-intel_glue.o
+crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
new file mode 100644
index 000000000000..2306d2e4816f
--- /dev/null
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -0,0 +1,1102 @@
/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

/*
 * Version licensed under 2-clause BSD License is available at:
 *	http://koti.mbnet.fi/axh/crypto/camellia-BSD-1.2.0-aesni1.tar.xz
 */

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

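[Editor's note: the two offsets above mirror the C-side context layout. A minimal sketch of the structure they are assumed to index into, based on the kernel's camellia_ctx definition in asm/crypto/camellia.h; the layout shown here is an assumption for illustration, the file itself only relies on the two byte offsets:

	/* assumed C-side layout: key_table at offset 0, key_length right
	 * after the 272-byte subkey table */
	struct camellia_ctx {
		u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)];
		u32 key_length;		/* 16, 24 or 32 (bytes) */
	};
]
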
/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

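[Editor's note: filter_8bit applies an arbitrary byte-wise function that has been decomposed into two 16-entry nibble tables: each output byte is lo_t[b & 0x0f] ^ hi_t[b >> 4]. A C intrinsics model of the same sequence (illustrative only, function name is hypothetical):

	#include <tmmintrin.h>	/* SSSE3: _mm_shuffle_epi8 */

	/* out[i] = lo_t[in[i] & 0x0f] ^ hi_t[in[i] >> 4], for all 16 bytes */
	static __m128i filter_8bit_model(__m128i x, __m128i lo_t, __m128i hi_t)
	{
		const __m128i mask4 = _mm_set1_epi8(0x0f);
		__m128i lo = _mm_and_si128(mask4, x);	/* low nibbles */
		/* keep only high nibbles, then shift them into low position;
		 * the bits shifted across byte boundaries are zero because
		 * the low nibbles were cleared by the andnot */
		__m128i hi = _mm_srli_epi32(_mm_andnot_si128(mask4, x), 4);

		return _mm_xor_si128(_mm_shuffle_epi8(lo_t, lo),
				     _mm_shuffle_epi8(hi_t, hi));
	}
]
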
/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; /* note: high and low parts swapped */ \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;

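[Editor's note: the four XOR passes marked "P-function" above are Camellia's byte-wise linear layer applied to eight byte-slices at once. A scalar restatement of exactly that instruction sequence (function name is hypothetical, the XOR network is taken verbatim from the macro):

	/* Camellia P-function: the same four XOR passes on eight byte-slices */
	static void p_function_model(u8 *x0, u8 *x1, u8 *x2, u8 *x3,
				     u8 *x4, u8 *x5, u8 *x6, u8 *x7)
	{
		*x0 ^= *x5; *x1 ^= *x6; *x2 ^= *x7; *x3 ^= *x4;
		*x4 ^= *x2; *x5 ^= *x3; *x6 ^= *x0; *x7 ^= *x1;
		*x0 ^= *x7; *x1 ^= *x4; *x2 ^= *x5; *x3 ^= *x6;
		*x4 ^= *x3; *x5 ^= *x0; *x6 ^= *x1; *x7 ^= *x2;
	}
]
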
/*
 * Size optimization... with inlined roundsm16, binary would be over 5 times
 * larger and would only be 0.5% faster (on sandy-bridge).
 */
.align 8
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	ret;

.align 8
roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	ret;

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

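[Editor's note: rol32_1_16 rotates sixteen byte-sliced 32-bit words left by one bit. Each slice is doubled per byte (vpaddb v, v, v), and the bit carried out of each byte's MSB is recovered with a signed compare plus vpabsb and OR'd into the neighbouring slice. A hedged intrinsics model of the whole macro (names are illustrative; slice ordering follows the macro's argument order, with v3's carry wrapping around into v0):

	#include <tmmintrin.h>	/* SSSE3: _mm_abs_epi8 */

	static void rol32_1_sliced_model(__m128i v[4])
	{
		const __m128i zero = _mm_setzero_si128();
		__m128i c[4];
		int i;

		for (i = 0; i < 4; i++)	/* 0xff where MSB set, abs() -> 0x01 */
			c[i] = _mm_abs_epi8(_mm_cmpgt_epi8(zero, v[i]));
		for (i = 0; i < 4; i++)	/* per-byte shift left by one */
			v[i] = _mm_add_epi8(v[i], v[i]);
		for (i = 0; i < 4; i++)	/* carry each MSB into the next slice */
			v[(i + 1) % 4] = _mm_or_si128(v[(i + 1) % 4], c[i]);
	}
]
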
/*
 * IN:
 *  r: byte-sliced AB state in memory
 *  l: byte-sliced CD state in memory
 * OUT:
 *  x0..x7: new byte-sliced CD state
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

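[Editor's note: taken together, the four blocks of fls16 are Camellia's FL/FL⁻¹ layer. On one 64-bit half each, the scalar form, following the macro's inline pseudocode and its instruction order (function name hypothetical; rol32 behaves like the kernel's rol32 from linux/bitops.h):

	static inline u32 rol32_model(u32 x, unsigned int n)
	{
		return (x << n) | (x >> (32 - n));
	}

	/* FL on the left half (ll||lr), FL^-1 on the right half (rl||rr) */
	static void fl_layer_model(u32 *ll, u32 *lr, u32 *rl, u32 *rr,
				   u32 kll, u32 klr, u32 krl, u32 krr)
	{
		*lr ^= rol32_model(*ll & kll, 1);
		*rl ^= (*rr | krr);
		*rr ^= rol32_model(*rl & krl, 1);	/* uses updated rl */
		*ll ^= (*lr | klr);			/* uses updated lr */
	}
]
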
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

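[Editor's note: transpose_4x4 is the standard unpack-based transpose of a 4x4 matrix of 32-bit lanes. An equivalent intrinsics model of the macro (C names stand in for the registers):

	#include <emmintrin.h>	/* SSE2 */

	/* transpose the 4x4 matrix of 32-bit lanes held in x0..x3 */
	static void transpose_4x4_model(__m128i *x0, __m128i *x1,
					__m128i *x2, __m128i *x3)
	{
		__m128i lo01 = _mm_unpacklo_epi32(*x0, *x1);
		__m128i hi01 = _mm_unpackhi_epi32(*x0, *x1);
		__m128i lo23 = _mm_unpacklo_epi32(*x2, *x3);
		__m128i hi23 = _mm_unpackhi_epi32(*x2, *x3);

		*x0 = _mm_unpacklo_epi64(lo01, lo23);
		*x1 = _mm_unpackhi_epi64(lo01, lo23);
		*x2 = _mm_unpacklo_epi64(hi01, hi23);
		*x3 = _mm_unpackhi_epi64(hi01, hi23);
	}
]
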
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

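[Editor's note: conceptually, byteslice_16x16b is a 16x16 byte-matrix transpose over the sixteen input blocks: afterwards, register i holds byte i of every block, so one vector instruction can process the same byte position of all 16 blocks at once. A scalar model of the resulting layout (buffer names hypothetical, and it ignores the in-vector byte-order caveat noted just above):

	/* sliced[i][j] = byte i of block j, for 16 blocks of 16 bytes */
	static void byteslice_model(u8 sliced[16][16], const u8 blocks[16][16])
	{
		int i, j;

		for (i = 0; i < 16; i++)
			for (j = 0; j < 16; j++)
				sliced[i][j] = blocks[j][i];
	}
]
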
/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);

.data
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* 4-bit mask */
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
.type __camellia_enc_blk16,@function;

__camellia_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;

.align 8
.type __camellia_dec_blk16,@function;

__camellia_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	ret;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;

.align 8
.global camellia_ecb_enc_16way
.type camellia_ecb_enc_16way,@function;

camellia_ecb_enc_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;

.align 8
.global camellia_ecb_dec_16way
.type camellia_ecb_dec_16way,@function;

camellia_ecb_dec_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;

.align 8
.global camellia_cbc_dec_16way
.type camellia_cbc_dec_16way,@function;

camellia_cbc_dec_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* max */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

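[Editor's note: inc_le128 increments a 128-bit little-endian counter held in an XMM register. The caller sets minus_one to {low: -1, high: 0} (see the vpcmpeqd/vpsrldq pair below), so the first vpsubq adds 1 only to the low 64-bit lane, and the vpcmpeqq/vpslldq/vpsubq triple propagates the carry into the high lane when the low lane wrapped. The scalar behaviour is simply:

	/* scalar model of inc_le128 on a 128-bit little-endian counter */
	static void inc_le128_model(u64 *lo, u64 *hi)
	{
		if (++(*lo) == 0)	/* low lane wrapped from ~0 to 0 */
			++(*hi);	/* carry into the high lane */
	}
]
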
.align 8
.global camellia_ctr_16way
.type camellia_ctr_16way,@function;

camellia_ctr_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	ret;
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
new file mode 100644
index 000000000000..96cbb6068fce
--- /dev/null
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -0,0 +1,558 @@
/*
 * Glue Code for x86_64/AVX/AES-NI assembler optimized version of Camellia
 *
 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <crypto/algapi.h>
#include <crypto/ctr.h>
#include <crypto/lrw.h>
#include <crypto/xts.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <asm/crypto/camellia.h>
#include <asm/crypto/ablk_helper.h>
#include <asm/crypto/glue_helper.h>

#define CAMELLIA_AESNI_PARALLEL_BLOCKS 16

/* 16-way AES-NI parallel cipher functions */
asmlinkage void camellia_ecb_enc_16way(struct camellia_ctx *ctx, u8 *dst,
				       const u8 *src);
asmlinkage void camellia_ecb_dec_16way(struct camellia_ctx *ctx, u8 *dst,
				       const u8 *src);

asmlinkage void camellia_cbc_dec_16way(struct camellia_ctx *ctx, u8 *dst,
				       const u8 *src);
asmlinkage void camellia_ctr_16way(struct camellia_ctx *ctx, u8 *dst,
				   const u8 *src, le128 *iv);

static const struct common_glue_ctx camellia_enc = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_enc_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_enc_blk) }
	} }
};

static const struct common_glue_ctx camellia_ctr = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_ctr_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ctr = GLUE_CTR_FUNC_CAST(camellia_crypt_ctr) }
	} }
};

static const struct common_glue_ctx camellia_dec = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_ecb_dec_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .ecb = GLUE_FUNC_CAST(camellia_dec_blk) }
	} }
};

static const struct common_glue_ctx camellia_dec_cbc = {
	.num_funcs = 3,
	.fpu_blocks_limit = CAMELLIA_AESNI_PARALLEL_BLOCKS,

	.funcs = { {
		.num_blocks = CAMELLIA_AESNI_PARALLEL_BLOCKS,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_cbc_dec_16way) }
	}, {
		.num_blocks = 2,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_decrypt_cbc_2way) }
	}, {
		.num_blocks = 1,
		.fn_u = { .cbc = GLUE_CBC_FUNC_CAST(camellia_dec_blk) }
	} }
};

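[Editor's note: each common_glue_ctx lists implementations from widest to narrowest (16-way, then 2-way, then 1-way); the glue helper is expected to drain each contiguous chunk with the widest function that still fits. A hedged sketch of that dispatch loop, modelled on glue_helper's behaviour rather than copied from it (function name hypothetical):

	/* process one chunk of nbytes with the widest fn that still fits */
	static unsigned int dispatch_model(const struct common_glue_ctx *gctx,
					   void *ctx, u8 *dst, const u8 *src,
					   unsigned int nbytes,
					   unsigned int bsize)
	{
		unsigned int i, func_bytes;

		for (i = 0; i < gctx->num_funcs; i++) {
			func_bytes = bsize * gctx->funcs[i].num_blocks;
			while (nbytes >= func_bytes) {
				gctx->funcs[i].fn_u.ecb(ctx, dst, src);
				src += func_bytes;
				dst += func_bytes;
				nbytes -= func_bytes;
			}
		}
		return nbytes;	/* leftover smaller than one block */
	}
]
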
static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&camellia_enc, desc, dst, src, nbytes);
}

static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_ecb_crypt_128bit(&camellia_dec, desc, dst, src, nbytes);
}

static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_cbc_encrypt_128bit(GLUE_FUNC_CAST(camellia_enc_blk), desc,
				       dst, src, nbytes);
}

static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	return glue_cbc_decrypt_128bit(&camellia_dec_cbc, desc, dst, src,
				       nbytes);
}

static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		     struct scatterlist *src, unsigned int nbytes)
{
	return glue_ctr_crypt_128bit(&camellia_ctr, desc, dst, src, nbytes);
}

static inline bool camellia_fpu_begin(bool fpu_enabled, unsigned int nbytes)
{
	return glue_fpu_begin(CAMELLIA_BLOCK_SIZE,
			      CAMELLIA_AESNI_PARALLEL_BLOCKS, NULL, fpu_enabled,
			      nbytes);
}

static inline void camellia_fpu_end(bool fpu_enabled)
{
	glue_fpu_end(fpu_enabled);
}

static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key,
			   unsigned int key_len)
{
	return __camellia_setkey(crypto_tfm_ctx(tfm), in_key, key_len,
				 &tfm->crt_flags);
}

struct crypt_priv {
	struct camellia_ctx *ctx;
	bool fpu_enabled;
};

static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
	struct crypt_priv *ctx = priv;
	int i;

	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);

	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
		camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
	}

	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
		camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
	}

	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
		camellia_enc_blk(ctx->ctx, srcdst, srcdst);
}

static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
{
	const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
	struct crypt_priv *ctx = priv;
	int i;

	ctx->fpu_enabled = camellia_fpu_begin(ctx->fpu_enabled, nbytes);

	if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
		camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
		srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
		nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
	}

	while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
		camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
		srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
		nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
	}

	for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
		camellia_dec_blk(ctx->ctx, srcdst, srcdst);
}

static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
	struct crypt_priv crypt_ctx = {
		.ctx = &ctx->camellia_ctx,
		.fpu_enabled = false,
	};
	struct lrw_crypt_req req = {
		.tbuf = buf,
		.tbuflen = sizeof(buf),

		.table_ctx = &ctx->lrw_table,
		.crypt_ctx = &crypt_ctx,
		.crypt_fn = encrypt_callback,
	};
	int ret;

	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
	ret = lrw_crypt(desc, dst, src, nbytes, &req);
	camellia_fpu_end(crypt_ctx.fpu_enabled);

	return ret;
}

static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct camellia_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
	struct crypt_priv crypt_ctx = {
		.ctx = &ctx->camellia_ctx,
		.fpu_enabled = false,
	};
	struct lrw_crypt_req req = {
		.tbuf = buf,
		.tbuflen = sizeof(buf),

		.table_ctx = &ctx->lrw_table,
		.crypt_ctx = &crypt_ctx,
		.crypt_fn = decrypt_callback,
	};
	int ret;

	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
	ret = lrw_crypt(desc, dst, src, nbytes, &req);
	camellia_fpu_end(crypt_ctx.fpu_enabled);

	return ret;
}

static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
	struct crypt_priv crypt_ctx = {
		.ctx = &ctx->crypt_ctx,
		.fpu_enabled = false,
	};
	struct xts_crypt_req req = {
		.tbuf = buf,
		.tbuflen = sizeof(buf),

		.tweak_ctx = &ctx->tweak_ctx,
		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
		.crypt_ctx = &crypt_ctx,
		.crypt_fn = encrypt_callback,
	};
	int ret;

	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
	ret = xts_crypt(desc, dst, src, nbytes, &req);
	camellia_fpu_end(crypt_ctx.fpu_enabled);

	return ret;
}

static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst,
		       struct scatterlist *src, unsigned int nbytes)
{
	struct camellia_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm);
	be128 buf[CAMELLIA_AESNI_PARALLEL_BLOCKS];
	struct crypt_priv crypt_ctx = {
		.ctx = &ctx->crypt_ctx,
		.fpu_enabled = false,
	};
	struct xts_crypt_req req = {
		.tbuf = buf,
		.tbuflen = sizeof(buf),

		.tweak_ctx = &ctx->tweak_ctx,
		.tweak_fn = XTS_TWEAK_CAST(camellia_enc_blk),
		.crypt_ctx = &crypt_ctx,
		.crypt_fn = decrypt_callback,
	};
	int ret;

	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
	ret = xts_crypt(desc, dst, src, nbytes, &req);
	camellia_fpu_end(crypt_ctx.fpu_enabled);

	return ret;
}

static struct crypto_alg cmll_algs[10] = { {
	.cra_name		= "__ecb-camellia-aesni",
	.cra_driver_name	= "__driver-ecb-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.setkey		= camellia_setkey,
			.encrypt	= ecb_encrypt,
			.decrypt	= ecb_decrypt,
		},
	},
}, {
	.cra_name		= "__cbc-camellia-aesni",
	.cra_driver_name	= "__driver-cbc-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.setkey		= camellia_setkey,
			.encrypt	= cbc_encrypt,
			.decrypt	= cbc_decrypt,
		},
	},
}, {
	.cra_name		= "__ctr-camellia-aesni",
	.cra_driver_name	= "__driver-ctr-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= 1,
	.cra_ctxsize		= sizeof(struct camellia_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= camellia_setkey,
			.encrypt	= ctr_crypt,
			.decrypt	= ctr_crypt,
		},
	},
}, {
	.cra_name		= "__lrw-camellia-aesni",
	.cra_driver_name	= "__driver-lrw-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_exit		= lrw_camellia_exit_tfm,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE +
					  CAMELLIA_BLOCK_SIZE,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE +
					  CAMELLIA_BLOCK_SIZE,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= lrw_camellia_setkey,
			.encrypt	= lrw_encrypt,
			.decrypt	= lrw_decrypt,
		},
	},
}, {
	.cra_name		= "__xts-camellia-aesni",
	.cra_driver_name	= "__driver-xts-camellia-aesni",
	.cra_priority		= 0,
	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
	.cra_alignmask		= 0,
	.cra_type		= &crypto_blkcipher_type,
	.cra_module		= THIS_MODULE,
	.cra_u = {
		.blkcipher = {
			.min_keysize	= CAMELLIA_MIN_KEY_SIZE * 2,
			.max_keysize	= CAMELLIA_MAX_KEY_SIZE * 2,
			.ivsize		= CAMELLIA_BLOCK_SIZE,
			.setkey		= xts_camellia_setkey,
			.encrypt	= xts_encrypt,
			.decrypt	= xts_decrypt,
		},
	},
}, {
	.cra_name		= "ecb(camellia)",
	.cra_driver_name	= "ecb-camellia-aesni",
	.cra_priority		= 400,
	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC,
	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
421 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
422 | .cra_alignmask = 0, | ||
423 | .cra_type = &crypto_ablkcipher_type, | ||
424 | .cra_module = THIS_MODULE, | ||
425 | .cra_init = ablk_init, | ||
426 | .cra_exit = ablk_exit, | ||
427 | .cra_u = { | ||
428 | .ablkcipher = { | ||
429 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
430 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
431 | .setkey = ablk_set_key, | ||
432 | .encrypt = ablk_encrypt, | ||
433 | .decrypt = ablk_decrypt, | ||
434 | }, | ||
435 | }, | ||
436 | }, { | ||
437 | .cra_name = "cbc(camellia)", | ||
438 | .cra_driver_name = "cbc-camellia-aesni", | ||
439 | .cra_priority = 400, | ||
440 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
441 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
442 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
443 | .cra_alignmask = 0, | ||
444 | .cra_type = &crypto_ablkcipher_type, | ||
445 | .cra_module = THIS_MODULE, | ||
446 | .cra_init = ablk_init, | ||
447 | .cra_exit = ablk_exit, | ||
448 | .cra_u = { | ||
449 | .ablkcipher = { | ||
450 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
451 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
452 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
453 | .setkey = ablk_set_key, | ||
454 | .encrypt = __ablk_encrypt, | ||
455 | .decrypt = ablk_decrypt, | ||
456 | }, | ||
457 | }, | ||
458 | }, { | ||
459 | .cra_name = "ctr(camellia)", | ||
460 | .cra_driver_name = "ctr-camellia-aesni", | ||
461 | .cra_priority = 400, | ||
462 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
463 | .cra_blocksize = 1, | ||
464 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
465 | .cra_alignmask = 0, | ||
466 | .cra_type = &crypto_ablkcipher_type, | ||
467 | .cra_module = THIS_MODULE, | ||
468 | .cra_init = ablk_init, | ||
469 | .cra_exit = ablk_exit, | ||
470 | .cra_u = { | ||
471 | .ablkcipher = { | ||
472 | .min_keysize = CAMELLIA_MIN_KEY_SIZE, | ||
473 | .max_keysize = CAMELLIA_MAX_KEY_SIZE, | ||
474 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
475 | .setkey = ablk_set_key, | ||
476 | .encrypt = ablk_encrypt, | ||
477 | .decrypt = ablk_encrypt, | ||
478 | .geniv = "chainiv", | ||
479 | }, | ||
480 | }, | ||
481 | }, { | ||
482 | .cra_name = "lrw(camellia)", | ||
483 | .cra_driver_name = "lrw-camellia-aesni", | ||
484 | .cra_priority = 400, | ||
485 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
486 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
487 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
488 | .cra_alignmask = 0, | ||
489 | .cra_type = &crypto_ablkcipher_type, | ||
490 | .cra_module = THIS_MODULE, | ||
491 | .cra_init = ablk_init, | ||
492 | .cra_exit = ablk_exit, | ||
493 | .cra_u = { | ||
494 | .ablkcipher = { | ||
495 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | ||
496 | CAMELLIA_BLOCK_SIZE, | ||
497 | .max_keysize = CAMELLIA_MAX_KEY_SIZE + | ||
498 | CAMELLIA_BLOCK_SIZE, | ||
499 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
500 | .setkey = ablk_set_key, | ||
501 | .encrypt = ablk_encrypt, | ||
502 | .decrypt = ablk_decrypt, | ||
503 | }, | ||
504 | }, | ||
505 | }, { | ||
506 | .cra_name = "xts(camellia)", | ||
507 | .cra_driver_name = "xts-camellia-aesni", | ||
508 | .cra_priority = 400, | ||
509 | .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, | ||
510 | .cra_blocksize = CAMELLIA_BLOCK_SIZE, | ||
511 | .cra_ctxsize = sizeof(struct async_helper_ctx), | ||
512 | .cra_alignmask = 0, | ||
513 | .cra_type = &crypto_ablkcipher_type, | ||
514 | .cra_module = THIS_MODULE, | ||
515 | .cra_init = ablk_init, | ||
516 | .cra_exit = ablk_exit, | ||
517 | .cra_u = { | ||
518 | .ablkcipher = { | ||
519 | .min_keysize = CAMELLIA_MIN_KEY_SIZE * 2, | ||
520 | .max_keysize = CAMELLIA_MAX_KEY_SIZE * 2, | ||
521 | .ivsize = CAMELLIA_BLOCK_SIZE, | ||
522 | .setkey = ablk_set_key, | ||
523 | .encrypt = ablk_encrypt, | ||
524 | .decrypt = ablk_decrypt, | ||
525 | }, | ||
526 | }, | ||
527 | } }; | ||
528 | |||
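Only the second half of this table is meant for users: the __driver-* entries register at priority 0 and exist so the ablk_helper/cryptd machinery can wrap them into the async, FPU-safe *-camellia-aesni instances at priority 400. A hedged usage sketch with the ablkcipher API of this era (the function itself is illustrative; request setup is elided):

static int camellia_cbc_example(const u8 *key)
{
        struct crypto_ablkcipher *tfm;
        int err;

        /* Picks cbc-camellia-aesni (priority 400) when this module is
         * loaded; otherwise the generic implementation still matches. */
        tfm = crypto_alloc_ablkcipher("cbc(camellia)", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        err = crypto_ablkcipher_setkey(tfm, key, CAMELLIA_MIN_KEY_SIZE);

        /* ... allocate an ablkcipher_request, set src/dst/iv, then call
         * crypto_ablkcipher_encrypt(); completion may be asynchronous ... */

        crypto_free_ablkcipher(tfm);
        return err;
}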
529 | static int __init camellia_aesni_init(void) | ||
530 | { | ||
531 | u64 xcr0; | ||
532 | |||
533 | if (!cpu_has_avx || !cpu_has_aes || !cpu_has_osxsave) { | ||
534 | pr_info("AVX or AES-NI instructions are not detected.\n"); | ||
535 | return -ENODEV; | ||
536 | } | ||
537 | |||
538 | xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); | ||
539 | if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { | ||
540 | pr_info("AVX detected but unusable.\n"); | ||
541 | return -ENODEV; | ||
542 | } | ||
543 | |||
544 | return crypto_register_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
545 | } | ||
546 | |||
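CPUID advertising AVX and AES-NI is not sufficient: the OS must also have enabled XSAVE for SSE and YMM state, which is what the XCR0 read checks. What xgetbv() boils down to (sketch; the kernel helper of this era lives in asm/xcr.h and hand-encodes the instruction for old assemblers):

static inline u64 xgetbv(u32 index)
{
        u32 eax, edx;

        /* .byte 0x0f, 0x01, 0xd0 == xgetbv; %ecx selects the XCR,
         * index 0 being XCR0 (XFEATURE_ENABLED_MASK). */
        asm volatile(".byte 0x0f,0x01,0xd0"
                     : "=a" (eax), "=d" (edx)
                     : "c" (index));
        return eax + ((u64)edx << 32);
}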
547 | static void __exit camellia_aesni_fini(void) | ||
548 | { | ||
549 | crypto_unregister_algs(cmll_algs, ARRAY_SIZE(cmll_algs)); | ||
550 | } | ||
551 | |||
552 | module_init(camellia_aesni_init); | ||
553 | module_exit(camellia_aesni_fini); | ||
554 | |||
555 | MODULE_LICENSE("GPL"); | ||
556 | MODULE_DESCRIPTION("Camellia Cipher Algorithm, AES-NI/AVX optimized"); | ||
557 | MODULE_ALIAS("camellia"); | ||
558 | MODULE_ALIAS("camellia-asm"); | ||
diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c
index 42ffd2bbab5b..5cb86ccd4acb 100644
--- a/arch/x86/crypto/camellia_glue.c
+++ b/arch/x86/crypto/camellia_glue.c
@@ -32,53 +32,24 @@ | |||
32 | #include <crypto/algapi.h> | 32 | #include <crypto/algapi.h> |
33 | #include <crypto/lrw.h> | 33 | #include <crypto/lrw.h> |
34 | #include <crypto/xts.h> | 34 | #include <crypto/xts.h> |
35 | #include <asm/crypto/camellia.h> | ||
35 | #include <asm/crypto/glue_helper.h> | 36 | #include <asm/crypto/glue_helper.h> |
36 | 37 | ||
37 | #define CAMELLIA_MIN_KEY_SIZE 16 | ||
38 | #define CAMELLIA_MAX_KEY_SIZE 32 | ||
39 | #define CAMELLIA_BLOCK_SIZE 16 | ||
40 | #define CAMELLIA_TABLE_BYTE_LEN 272 | ||
41 | |||
42 | struct camellia_ctx { | ||
43 | u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; | ||
44 | u32 key_length; | ||
45 | }; | ||
46 | |||
47 | /* regular block cipher functions */ | 38 | /* regular block cipher functions */ |
48 | asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, | 39 | asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, |
49 | const u8 *src, bool xor); | 40 | const u8 *src, bool xor); |
41 | EXPORT_SYMBOL_GPL(__camellia_enc_blk); | ||
50 | asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, | 42 | asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, |
51 | const u8 *src); | 43 | const u8 *src); |
44 | EXPORT_SYMBOL_GPL(camellia_dec_blk); | ||
52 | 45 | ||
53 | /* 2-way parallel cipher functions */ | 46 | /* 2-way parallel cipher functions */ |
54 | asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, | 47 | asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, |
55 | const u8 *src, bool xor); | 48 | const u8 *src, bool xor); |
49 | EXPORT_SYMBOL_GPL(__camellia_enc_blk_2way); | ||
56 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, | 50 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, |
57 | const u8 *src); | 51 | const u8 *src); |
58 | 52 | EXPORT_SYMBOL_GPL(camellia_dec_blk_2way); | |
59 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, | ||
60 | const u8 *src) | ||
61 | { | ||
62 | __camellia_enc_blk(ctx, dst, src, false); | ||
63 | } | ||
64 | |||
65 | static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, | ||
66 | const u8 *src) | ||
67 | { | ||
68 | __camellia_enc_blk(ctx, dst, src, true); | ||
69 | } | ||
70 | |||
71 | static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, | ||
72 | const u8 *src) | ||
73 | { | ||
74 | __camellia_enc_blk_2way(ctx, dst, src, false); | ||
75 | } | ||
76 | |||
77 | static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, | ||
78 | const u8 *src) | ||
79 | { | ||
80 | __camellia_enc_blk_2way(ctx, dst, src, true); | ||
81 | } | ||
82 | 53 | ||
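The inline wrappers removed above are not dropped; they move into the new asm/crypto/camellia.h so both the 2-way and the AES-NI/AVX glue can share them on top of the now-exported assembler entry points. Reconstructed from the deleted lines, the header keeps them in this shape:

static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst,
                                    const u8 *src)
{
        __camellia_enc_blk(ctx, dst, src, false);
}

static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst,
                                        const u8 *src)
{
        __camellia_enc_blk(ctx, dst, src, true);
}

static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst,
                                         const u8 *src)
{
        __camellia_enc_blk_2way(ctx, dst, src, false);
}

static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx,
                                             u8 *dst, const u8 *src)
{
        __camellia_enc_blk_2way(ctx, dst, src, true);
}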
83 | static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) | 54 | static void camellia_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) |
84 | { | 55 | { |
@@ -1275,9 +1246,8 @@ static void camellia_setup192(const unsigned char *key, u64 *subkey) | |||
1275 | camellia_setup256(kk, subkey); | 1246 | camellia_setup256(kk, subkey); |
1276 | } | 1247 | } |
1277 | 1248 | ||
1278 | static int __camellia_setkey(struct camellia_ctx *cctx, | 1249 | int __camellia_setkey(struct camellia_ctx *cctx, const unsigned char *key, |
1279 | const unsigned char *key, | 1250 | unsigned int key_len, u32 *flags) |
1280 | unsigned int key_len, u32 *flags) | ||
1281 | { | 1251 | { |
1282 | if (key_len != 16 && key_len != 24 && key_len != 32) { | 1252 | if (key_len != 16 && key_len != 24 && key_len != 32) { |
1283 | *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; | 1253 | *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; |
@@ -1300,6 +1270,7 @@ static int __camellia_setkey(struct camellia_ctx *cctx, | |||
1300 | 1270 | ||
1301 | return 0; | 1271 | return 0; |
1302 | } | 1272 | } |
1273 | EXPORT_SYMBOL_GPL(__camellia_setkey); | ||
1303 | 1274 | ||
1304 | static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, | 1275 | static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, |
1305 | unsigned int key_len) | 1276 | unsigned int key_len) |
@@ -1308,7 +1279,7 @@ static int camellia_setkey(struct crypto_tfm *tfm, const u8 *in_key, | |||
1308 | &tfm->crt_flags); | 1279 | &tfm->crt_flags); |
1309 | } | 1280 | } |
1310 | 1281 | ||
1311 | static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) | 1282 | void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) |
1312 | { | 1283 | { |
1313 | u128 iv = *src; | 1284 | u128 iv = *src; |
1314 | 1285 | ||
@@ -1316,22 +1287,23 @@ static void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src) | |||
1316 | 1287 | ||
1317 | u128_xor(&dst[1], &dst[1], &iv); | 1288 | u128_xor(&dst[1], &dst[1], &iv); |
1318 | } | 1289 | } |
1290 | EXPORT_SYMBOL_GPL(camellia_decrypt_cbc_2way); | ||
1319 | 1291 | ||
1320 | static void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) | 1292 | void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
1321 | { | 1293 | { |
1322 | be128 ctrblk; | 1294 | be128 ctrblk; |
1323 | 1295 | ||
1324 | if (dst != src) | 1296 | if (dst != src) |
1325 | *dst = *src; | 1297 | *dst = *src; |
1326 | 1298 | ||
1327 | u128_to_be128(&ctrblk, iv); | 1299 | le128_to_be128(&ctrblk, iv); |
1328 | u128_inc(iv); | 1300 | le128_inc(iv); |
1329 | 1301 | ||
1330 | camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); | 1302 | camellia_enc_blk_xor(ctx, (u8 *)dst, (u8 *)&ctrblk); |
1331 | } | 1303 | } |
1304 | EXPORT_SYMBOL_GPL(camellia_crypt_ctr); | ||
1332 | 1305 | ||
1333 | static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, | 1306 | void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
1334 | u128 *iv) | ||
1335 | { | 1307 | { |
1336 | be128 ctrblks[2]; | 1308 | be128 ctrblks[2]; |
1337 | 1309 | ||
@@ -1340,13 +1312,14 @@ static void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, | |||
1340 | dst[1] = src[1]; | 1312 | dst[1] = src[1]; |
1341 | } | 1313 | } |
1342 | 1314 | ||
1343 | u128_to_be128(&ctrblks[0], iv); | 1315 | le128_to_be128(&ctrblks[0], iv); |
1344 | u128_inc(iv); | 1316 | le128_inc(iv); |
1345 | u128_to_be128(&ctrblks[1], iv); | 1317 | le128_to_be128(&ctrblks[1], iv); |
1346 | u128_inc(iv); | 1318 | le128_inc(iv); |
1347 | 1319 | ||
1348 | camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); | 1320 | camellia_enc_blk_xor_2way(ctx, (u8 *)dst, (u8 *)ctrblks); |
1349 | } | 1321 | } |
1322 | EXPORT_SYMBOL_GPL(camellia_crypt_ctr_2way); | ||
1350 | 1323 | ||
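The CTR callbacks switch from u128 to le128 for the IV, keeping the counter little-endian between calls and converting to big-endian only for the block actually fed to the cipher. The 128-bit increment this relies on, as a sketch (assumed to match the glue_helper definition):

static inline void le128_inc(le128 *i)
{
        u64 a = le64_to_cpu(i->a);      /* high 64 bits */
        u64 b = le64_to_cpu(i->b);      /* low 64 bits */

        b++;
        if (!b)                         /* low half wrapped: carry up */
                a++;

        i->a = cpu_to_le64(a);
        i->b = cpu_to_le64(b);
}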
1351 | static const struct common_glue_ctx camellia_enc = { | 1324 | static const struct common_glue_ctx camellia_enc = { |
1352 | .num_funcs = 2, | 1325 | .num_funcs = 2, |
@@ -1464,13 +1437,8 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
1464 | camellia_dec_blk(ctx, srcdst, srcdst); | 1437 | camellia_dec_blk(ctx, srcdst, srcdst); |
1465 | } | 1438 | } |
1466 | 1439 | ||
1467 | struct camellia_lrw_ctx { | 1440 | int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, |
1468 | struct lrw_table_ctx lrw_table; | 1441 | unsigned int keylen) |
1469 | struct camellia_ctx camellia_ctx; | ||
1470 | }; | ||
1471 | |||
1472 | static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
1473 | unsigned int keylen) | ||
1474 | { | 1442 | { |
1475 | struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); | 1443 | struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); |
1476 | int err; | 1444 | int err; |
@@ -1484,6 +1452,7 @@ static int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, | |||
1484 | return lrw_init_table(&ctx->lrw_table, | 1452 | return lrw_init_table(&ctx->lrw_table, |
1485 | key + keylen - CAMELLIA_BLOCK_SIZE); | 1453 | key + keylen - CAMELLIA_BLOCK_SIZE); |
1486 | } | 1454 | } |
1455 | EXPORT_SYMBOL_GPL(lrw_camellia_setkey); | ||
1487 | 1456 | ||
1488 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 1457 | static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
1489 | struct scatterlist *src, unsigned int nbytes) | 1458 | struct scatterlist *src, unsigned int nbytes) |
@@ -1519,20 +1488,16 @@ static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |||
1519 | return lrw_crypt(desc, dst, src, nbytes, &req); | 1488 | return lrw_crypt(desc, dst, src, nbytes, &req); |
1520 | } | 1489 | } |
1521 | 1490 | ||
1522 | static void lrw_exit_tfm(struct crypto_tfm *tfm) | 1491 | void lrw_camellia_exit_tfm(struct crypto_tfm *tfm) |
1523 | { | 1492 | { |
1524 | struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); | 1493 | struct camellia_lrw_ctx *ctx = crypto_tfm_ctx(tfm); |
1525 | 1494 | ||
1526 | lrw_free_table(&ctx->lrw_table); | 1495 | lrw_free_table(&ctx->lrw_table); |
1527 | } | 1496 | } |
1497 | EXPORT_SYMBOL_GPL(lrw_camellia_exit_tfm); | ||
1528 | 1498 | ||
1529 | struct camellia_xts_ctx { | 1499 | int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, |
1530 | struct camellia_ctx tweak_ctx; | 1500 | unsigned int keylen) |
1531 | struct camellia_ctx crypt_ctx; | ||
1532 | }; | ||
1533 | |||
1534 | static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
1535 | unsigned int keylen) | ||
1536 | { | 1501 | { |
1537 | struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); | 1502 | struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm); |
1538 | u32 *flags = &tfm->crt_flags; | 1503 | u32 *flags = &tfm->crt_flags; |
@@ -1555,6 +1520,7 @@ static int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, | |||
1555 | return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, | 1520 | return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2, keylen / 2, |
1556 | flags); | 1521 | flags); |
1557 | } | 1522 | } |
1523 | EXPORT_SYMBOL_GPL(xts_camellia_setkey); | ||
1558 | 1524 | ||
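xts_camellia_setkey() splits the supplied key in half, so the data path and the tweak path each get an independent camellia key schedule; that split is why the XTS entries in the alg tables advertise keysizes at twice the cipher's. In outline (condensed from the code above):

/* first half keys the data context, second half the tweak context */
err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
if (err)
        return err;

return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2,
                         keylen / 2, flags);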
1559 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | 1525 | static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, |
1560 | struct scatterlist *src, unsigned int nbytes) | 1526 | struct scatterlist *src, unsigned int nbytes) |
@@ -1679,7 +1645,7 @@ static struct crypto_alg camellia_algs[6] = { { | |||
1679 | .cra_alignmask = 0, | 1645 | .cra_alignmask = 0, |
1680 | .cra_type = &crypto_blkcipher_type, | 1646 | .cra_type = &crypto_blkcipher_type, |
1681 | .cra_module = THIS_MODULE, | 1647 | .cra_module = THIS_MODULE, |
1682 | .cra_exit = lrw_exit_tfm, | 1648 | .cra_exit = lrw_camellia_exit_tfm, |
1683 | .cra_u = { | 1649 | .cra_u = { |
1684 | .blkcipher = { | 1650 | .blkcipher = { |
1685 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + | 1651 | .min_keysize = CAMELLIA_MIN_KEY_SIZE + |
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index a41a3aaba220..15b00ac7cbd3 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -25,10 +25,10 @@ | |||
25 | 25 | ||
26 | .file "cast5-avx-x86_64-asm_64.S" | 26 | .file "cast5-avx-x86_64-asm_64.S" |
27 | 27 | ||
28 | .extern cast5_s1 | 28 | .extern cast_s1 |
29 | .extern cast5_s2 | 29 | .extern cast_s2 |
30 | .extern cast5_s3 | 30 | .extern cast_s3 |
31 | .extern cast5_s4 | 31 | .extern cast_s4 |
32 | 32 | ||
33 | /* structure of crypto context */ | 33 | /* structure of crypto context */ |
34 | #define km 0 | 34 | #define km 0 |
@@ -36,10 +36,10 @@ | |||
36 | #define rr ((16*4)+16) | 36 | #define rr ((16*4)+16) |
37 | 37 | ||
38 | /* s-boxes */ | 38 | /* s-boxes */ |
39 | #define s1 cast5_s1 | 39 | #define s1 cast_s1 |
40 | #define s2 cast5_s2 | 40 | #define s2 cast_s2 |
41 | #define s3 cast5_s3 | 41 | #define s3 cast_s3 |
42 | #define s4 cast5_s4 | 42 | #define s4 cast_s4 |
43 | 43 | ||
44 | /********************************************************************** | 44 | /********************************************************************** |
45 | 16-way AVX cast5 | 45 | 16-way AVX cast5 |
@@ -180,31 +180,17 @@ | |||
180 | vpunpcklqdq t1, t0, x0; \ | 180 | vpunpcklqdq t1, t0, x0; \ |
181 | vpunpckhqdq t1, t0, x1; | 181 | vpunpckhqdq t1, t0, x1; |
182 | 182 | ||
183 | #define inpack_blocks(in, x0, x1, t0, t1, rmask) \ | 183 | #define inpack_blocks(x0, x1, t0, t1, rmask) \ |
184 | vmovdqu (0*4*4)(in), x0; \ | ||
185 | vmovdqu (1*4*4)(in), x1; \ | ||
186 | vpshufb rmask, x0, x0; \ | 184 | vpshufb rmask, x0, x0; \ |
187 | vpshufb rmask, x1, x1; \ | 185 | vpshufb rmask, x1, x1; \ |
188 | \ | 186 | \ |
189 | transpose_2x4(x0, x1, t0, t1) | 187 | transpose_2x4(x0, x1, t0, t1) |
190 | 188 | ||
191 | #define outunpack_blocks(out, x0, x1, t0, t1, rmask) \ | 189 | #define outunpack_blocks(x0, x1, t0, t1, rmask) \ |
192 | transpose_2x4(x0, x1, t0, t1) \ | 190 | transpose_2x4(x0, x1, t0, t1) \ |
193 | \ | 191 | \ |
194 | vpshufb rmask, x0, x0; \ | 192 | vpshufb rmask, x0, x0; \ |
195 | vpshufb rmask, x1, x1; \ | 193 | vpshufb rmask, x1, x1; |
196 | vmovdqu x0, (0*4*4)(out); \ | ||
197 | vmovdqu x1, (1*4*4)(out); | ||
198 | |||
199 | #define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \ | ||
200 | transpose_2x4(x0, x1, t0, t1) \ | ||
201 | \ | ||
202 | vpshufb rmask, x0, x0; \ | ||
203 | vpshufb rmask, x1, x1; \ | ||
204 | vpxor (0*4*4)(out), x0, x0; \ | ||
205 | vmovdqu x0, (0*4*4)(out); \ | ||
206 | vpxor (1*4*4)(out), x1, x1; \ | ||
207 | vmovdqu x1, (1*4*4)(out); | ||
208 | 194 | ||
209 | .data | 195 | .data |
210 | 196 | ||
@@ -213,6 +199,8 @@ | |||
213 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 | 199 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 |
214 | .Lbswap128_mask: | 200 | .Lbswap128_mask: |
215 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | 201 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 |
202 | .Lbswap_iv_mask: | ||
203 | .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0 | ||
216 | .L16_mask: | 204 | .L16_mask: |
217 | .byte 16, 16, 16, 16 | 205 | .byte 16, 16, 16, 16 |
218 | .L32_mask: | 206 | .L32_mask: |
@@ -223,35 +211,42 @@ | |||
223 | .text | 211 | .text |
224 | 212 | ||
225 | .align 16 | 213 | .align 16 |
226 | .global __cast5_enc_blk_16way | 214 | .type __cast5_enc_blk16,@function; |
227 | .type __cast5_enc_blk_16way,@function; | ||
228 | 215 | ||
229 | __cast5_enc_blk_16way: | 216 | __cast5_enc_blk16: |
230 | /* input: | 217 | /* input: |
231 | * %rdi: ctx, CTX | 218 | * %rdi: ctx, CTX |
232 | * %rsi: dst | 219 | * RL1: blocks 1 and 2 |
233 | * %rdx: src | 220 | * RR1: blocks 3 and 4 |
234 | * %rcx: bool, if true: xor output | 221 | * RL2: blocks 5 and 6 |
222 | * RR2: blocks 7 and 8 | ||
223 | * RL3: blocks 9 and 10 | ||
224 | * RR3: blocks 11 and 12 | ||
225 | * RL4: blocks 13 and 14 | ||
226 | * RR4: blocks 15 and 16 | ||
227 | * output: | ||
228 | * RL1: encrypted blocks 1 and 2 | ||
229 | * RR1: encrypted blocks 3 and 4 | ||
230 | * RL2: encrypted blocks 5 and 6 | ||
231 | * RR2: encrypted blocks 7 and 8 | ||
232 | * RL3: encrypted blocks 9 and 10 | ||
233 | * RR3: encrypted blocks 11 and 12 | ||
234 | * RL4: encrypted blocks 13 and 14 | ||
235 | * RR4: encrypted blocks 15 and 16 | ||
235 | */ | 236 | */ |
236 | 237 | ||
237 | pushq %rbp; | 238 | pushq %rbp; |
238 | pushq %rbx; | 239 | pushq %rbx; |
239 | pushq %rcx; | ||
240 | 240 | ||
241 | vmovdqa .Lbswap_mask, RKM; | 241 | vmovdqa .Lbswap_mask, RKM; |
242 | vmovd .Lfirst_mask, R1ST; | 242 | vmovd .Lfirst_mask, R1ST; |
243 | vmovd .L32_mask, R32; | 243 | vmovd .L32_mask, R32; |
244 | enc_preload_rkr(); | 244 | enc_preload_rkr(); |
245 | 245 | ||
246 | leaq 1*(2*4*4)(%rdx), %rax; | 246 | inpack_blocks(RL1, RR1, RTMP, RX, RKM); |
247 | inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); | 247 | inpack_blocks(RL2, RR2, RTMP, RX, RKM); |
248 | inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); | 248 | inpack_blocks(RL3, RR3, RTMP, RX, RKM); |
249 | leaq 2*(2*4*4)(%rdx), %rax; | 249 | inpack_blocks(RL4, RR4, RTMP, RX, RKM); |
250 | inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); | ||
251 | leaq 3*(2*4*4)(%rdx), %rax; | ||
252 | inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); | ||
253 | |||
254 | movq %rsi, %r11; | ||
255 | 250 | ||
256 | round(RL, RR, 0, 1); | 251 | round(RL, RR, 0, 1); |
257 | round(RR, RL, 1, 2); | 252 | round(RR, RL, 1, 2); |
@@ -276,44 +271,41 @@ __cast5_enc_blk_16way: | |||
276 | round(RR, RL, 15, 1); | 271 | round(RR, RL, 15, 1); |
277 | 272 | ||
278 | __skip_enc: | 273 | __skip_enc: |
279 | popq %rcx; | ||
280 | popq %rbx; | 274 | popq %rbx; |
281 | popq %rbp; | 275 | popq %rbp; |
282 | 276 | ||
283 | vmovdqa .Lbswap_mask, RKM; | 277 | vmovdqa .Lbswap_mask, RKM; |
284 | leaq 1*(2*4*4)(%r11), %rax; | ||
285 | 278 | ||
286 | testb %cl, %cl; | 279 | outunpack_blocks(RR1, RL1, RTMP, RX, RKM); |
287 | jnz __enc_xor16; | 280 | outunpack_blocks(RR2, RL2, RTMP, RX, RKM); |
288 | 281 | outunpack_blocks(RR3, RL3, RTMP, RX, RKM); | |
289 | outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); | 282 | outunpack_blocks(RR4, RL4, RTMP, RX, RKM); |
290 | outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); | ||
291 | leaq 2*(2*4*4)(%r11), %rax; | ||
292 | outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); | ||
293 | leaq 3*(2*4*4)(%r11), %rax; | ||
294 | outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); | ||
295 | |||
296 | ret; | ||
297 | |||
298 | __enc_xor16: | ||
299 | outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM); | ||
300 | outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM); | ||
301 | leaq 2*(2*4*4)(%r11), %rax; | ||
302 | outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM); | ||
303 | leaq 3*(2*4*4)(%r11), %rax; | ||
304 | outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM); | ||
305 | 283 | ||
306 | ret; | 284 | ret; |
307 | 285 | ||
308 | .align 16 | 286 | .align 16 |
309 | .global cast5_dec_blk_16way | 287 | .type __cast5_dec_blk16,@function; |
310 | .type cast5_dec_blk_16way,@function; | ||
311 | 288 | ||
312 | cast5_dec_blk_16way: | 289 | __cast5_dec_blk16: |
313 | /* input: | 290 | /* input: |
314 | * %rdi: ctx, CTX | 291 | * %rdi: ctx, CTX |
315 | * %rsi: dst | 292 | * RL1: encrypted blocks 1 and 2 |
316 | * %rdx: src | 293 | * RR1: encrypted blocks 3 and 4 |
294 | * RL2: encrypted blocks 5 and 6 | ||
295 | * RR2: encrypted blocks 7 and 8 | ||
296 | * RL3: encrypted blocks 9 and 10 | ||
297 | * RR3: encrypted blocks 11 and 12 | ||
298 | * RL4: encrypted blocks 13 and 14 | ||
299 | * RR4: encrypted blocks 15 and 16 | ||
300 | * output: | ||
301 | * RL1: decrypted blocks 1 and 2 | ||
302 | * RR1: decrypted blocks 3 and 4 | ||
303 | * RL2: decrypted blocks 5 and 6 | ||
304 | * RR2: decrypted blocks 7 and 8 | ||
305 | * RL3: decrypted blocks 9 and 10 | ||
306 | * RR3: decrypted blocks 11 and 12 | ||
307 | * RL4: decrypted blocks 13 and 14 | ||
308 | * RR4: decrypted blocks 15 and 16 | ||
317 | */ | 309 | */ |
318 | 310 | ||
319 | pushq %rbp; | 311 | pushq %rbp; |
@@ -324,15 +316,10 @@ cast5_dec_blk_16way: | |||
324 | vmovd .L32_mask, R32; | 316 | vmovd .L32_mask, R32; |
325 | dec_preload_rkr(); | 317 | dec_preload_rkr(); |
326 | 318 | ||
327 | leaq 1*(2*4*4)(%rdx), %rax; | 319 | inpack_blocks(RL1, RR1, RTMP, RX, RKM); |
328 | inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM); | 320 | inpack_blocks(RL2, RR2, RTMP, RX, RKM); |
329 | inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM); | 321 | inpack_blocks(RL3, RR3, RTMP, RX, RKM); |
330 | leaq 2*(2*4*4)(%rdx), %rax; | 322 | inpack_blocks(RL4, RR4, RTMP, RX, RKM); |
331 | inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM); | ||
332 | leaq 3*(2*4*4)(%rdx), %rax; | ||
333 | inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM); | ||
334 | |||
335 | movq %rsi, %r11; | ||
336 | 323 | ||
337 | movzbl rr(CTX), %eax; | 324 | movzbl rr(CTX), %eax; |
338 | testl %eax, %eax; | 325 | testl %eax, %eax; |
@@ -361,16 +348,211 @@ __dec_tail: | |||
361 | popq %rbx; | 348 | popq %rbx; |
362 | popq %rbp; | 349 | popq %rbp; |
363 | 350 | ||
364 | leaq 1*(2*4*4)(%r11), %rax; | 351 | outunpack_blocks(RR1, RL1, RTMP, RX, RKM); |
365 | outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM); | 352 | outunpack_blocks(RR2, RL2, RTMP, RX, RKM); |
366 | outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM); | 353 | outunpack_blocks(RR3, RL3, RTMP, RX, RKM); |
367 | leaq 2*(2*4*4)(%r11), %rax; | 354 | outunpack_blocks(RR4, RL4, RTMP, RX, RKM); |
368 | outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM); | ||
369 | leaq 3*(2*4*4)(%r11), %rax; | ||
370 | outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM); | ||
371 | 355 | ||
372 | ret; | 356 | ret; |
373 | 357 | ||
374 | __skip_dec: | 358 | __skip_dec: |
375 | vpsrldq $4, RKR, RKR; | 359 | vpsrldq $4, RKR, RKR; |
376 | jmp __dec_tail; | 360 | jmp __dec_tail; |
361 | |||
362 | .align 16 | ||
363 | .global cast5_ecb_enc_16way | ||
364 | .type cast5_ecb_enc_16way,@function; | ||
365 | |||
366 | cast5_ecb_enc_16way: | ||
367 | /* input: | ||
368 | * %rdi: ctx, CTX | ||
369 | * %rsi: dst | ||
370 | * %rdx: src | ||
371 | */ | ||
372 | |||
373 | movq %rsi, %r11; | ||
374 | |||
375 | vmovdqu (0*4*4)(%rdx), RL1; | ||
376 | vmovdqu (1*4*4)(%rdx), RR1; | ||
377 | vmovdqu (2*4*4)(%rdx), RL2; | ||
378 | vmovdqu (3*4*4)(%rdx), RR2; | ||
379 | vmovdqu (4*4*4)(%rdx), RL3; | ||
380 | vmovdqu (5*4*4)(%rdx), RR3; | ||
381 | vmovdqu (6*4*4)(%rdx), RL4; | ||
382 | vmovdqu (7*4*4)(%rdx), RR4; | ||
383 | |||
384 | call __cast5_enc_blk16; | ||
385 | |||
386 | vmovdqu RR1, (0*4*4)(%r11); | ||
387 | vmovdqu RL1, (1*4*4)(%r11); | ||
388 | vmovdqu RR2, (2*4*4)(%r11); | ||
389 | vmovdqu RL2, (3*4*4)(%r11); | ||
390 | vmovdqu RR3, (4*4*4)(%r11); | ||
391 | vmovdqu RL3, (5*4*4)(%r11); | ||
392 | vmovdqu RR4, (6*4*4)(%r11); | ||
393 | vmovdqu RL4, (7*4*4)(%r11); | ||
394 | |||
395 | ret; | ||
396 | |||
397 | .align 16 | ||
398 | .global cast5_ecb_dec_16way | ||
399 | .type cast5_ecb_dec_16way,@function; | ||
400 | |||
401 | cast5_ecb_dec_16way: | ||
402 | /* input: | ||
403 | * %rdi: ctx, CTX | ||
404 | * %rsi: dst | ||
405 | * %rdx: src | ||
406 | */ | ||
407 | |||
408 | movq %rsi, %r11; | ||
409 | |||
410 | vmovdqu (0*4*4)(%rdx), RL1; | ||
411 | vmovdqu (1*4*4)(%rdx), RR1; | ||
412 | vmovdqu (2*4*4)(%rdx), RL2; | ||
413 | vmovdqu (3*4*4)(%rdx), RR2; | ||
414 | vmovdqu (4*4*4)(%rdx), RL3; | ||
415 | vmovdqu (5*4*4)(%rdx), RR3; | ||
416 | vmovdqu (6*4*4)(%rdx), RL4; | ||
417 | vmovdqu (7*4*4)(%rdx), RR4; | ||
418 | |||
419 | call __cast5_dec_blk16; | ||
420 | |||
421 | vmovdqu RR1, (0*4*4)(%r11); | ||
422 | vmovdqu RL1, (1*4*4)(%r11); | ||
423 | vmovdqu RR2, (2*4*4)(%r11); | ||
424 | vmovdqu RL2, (3*4*4)(%r11); | ||
425 | vmovdqu RR3, (4*4*4)(%r11); | ||
426 | vmovdqu RL3, (5*4*4)(%r11); | ||
427 | vmovdqu RR4, (6*4*4)(%r11); | ||
428 | vmovdqu RL4, (7*4*4)(%r11); | ||
429 | |||
430 | ret; | ||
431 | |||
432 | .align 16 | ||
433 | .global cast5_cbc_dec_16way | ||
434 | .type cast5_cbc_dec_16way,@function; | ||
435 | |||
436 | cast5_cbc_dec_16way: | ||
437 | /* input: | ||
438 | * %rdi: ctx, CTX | ||
439 | * %rsi: dst | ||
440 | * %rdx: src | ||
441 | */ | ||
442 | |||
443 | pushq %r12; | ||
444 | |||
445 | movq %rsi, %r11; | ||
446 | movq %rdx, %r12; | ||
447 | |||
448 | vmovdqu (0*16)(%rdx), RL1; | ||
449 | vmovdqu (1*16)(%rdx), RR1; | ||
450 | vmovdqu (2*16)(%rdx), RL2; | ||
451 | vmovdqu (3*16)(%rdx), RR2; | ||
452 | vmovdqu (4*16)(%rdx), RL3; | ||
453 | vmovdqu (5*16)(%rdx), RR3; | ||
454 | vmovdqu (6*16)(%rdx), RL4; | ||
455 | vmovdqu (7*16)(%rdx), RR4; | ||
456 | |||
457 | call __cast5_dec_blk16; | ||
458 | |||
459 | /* xor with src */ | ||
460 | vmovq (%r12), RX; | ||
461 | vpshufd $0x4f, RX, RX; | ||
462 | vpxor RX, RR1, RR1; | ||
463 | vpxor 0*16+8(%r12), RL1, RL1; | ||
464 | vpxor 1*16+8(%r12), RR2, RR2; | ||
465 | vpxor 2*16+8(%r12), RL2, RL2; | ||
466 | vpxor 3*16+8(%r12), RR3, RR3; | ||
467 | vpxor 4*16+8(%r12), RL3, RL3; | ||
468 | vpxor 5*16+8(%r12), RR4, RR4; | ||
469 | vpxor 6*16+8(%r12), RL4, RL4; | ||
470 | |||
471 | vmovdqu RR1, (0*16)(%r11); | ||
472 | vmovdqu RL1, (1*16)(%r11); | ||
473 | vmovdqu RR2, (2*16)(%r11); | ||
474 | vmovdqu RL2, (3*16)(%r11); | ||
475 | vmovdqu RR3, (4*16)(%r11); | ||
476 | vmovdqu RL3, (5*16)(%r11); | ||
477 | vmovdqu RR4, (6*16)(%r11); | ||
478 | vmovdqu RL4, (7*16)(%r11); | ||
479 | |||
480 | popq %r12; | ||
481 | |||
482 | ret; | ||
483 | |||
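cast5_cbc_dec_16way() decrypts all 16 blocks first and only then applies the CBC chaining: the vmovq/vpshufd pair lines ciphertext block 0 up against decrypted block 1, and the +8-byte offset loads supply ciphertext blocks 1..14 for blocks 2..15. Reduced to scalar C (illustrative; cast5 blocks are 64-bit):

/* dst[] holds the 16 decrypted blocks, src[] the original ciphertext. */
static void cbc_chain_sketch(u64 *dst, const u64 *src)
{
        int i;

        for (i = 15; i >= 1; i--)
                dst[i] ^= src[i - 1];   /* xor with previous ciphertext */

        /* dst[0] ^= iv is left to __cbc_decrypt() in the glue code */
}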
484 | .align 16 | ||
485 | .global cast5_ctr_16way | ||
486 | .type cast5_ctr_16way,@function; | ||
487 | |||
488 | cast5_ctr_16way: | ||
489 | /* input: | ||
490 | * %rdi: ctx, CTX | ||
491 | * %rsi: dst | ||
492 | * %rdx: src | ||
493 | * %rcx: iv (big endian, 64bit) | ||
494 | */ | ||
495 | |||
496 | pushq %r12; | ||
497 | |||
498 | movq %rsi, %r11; | ||
499 | movq %rdx, %r12; | ||
500 | |||
501 | vpcmpeqd RTMP, RTMP, RTMP; | ||
502 | vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */ | ||
503 | |||
504 | vpcmpeqd RKR, RKR, RKR; | ||
505 | vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */ | ||
506 | vmovdqa .Lbswap_iv_mask, R1ST; | ||
507 | vmovdqa .Lbswap128_mask, RKM; | ||
508 | |||
509 | /* load IV and byteswap */ | ||
510 | vmovq (%rcx), RX; | ||
511 | vpshufb R1ST, RX, RX; | ||
512 | |||
513 | /* construct IVs */ | ||
514 | vpsubq RTMP, RX, RX; /* le: IV1, IV0 */ | ||
515 | vpshufb RKM, RX, RL1; /* be: IV0, IV1 */ | ||
516 | vpsubq RKR, RX, RX; | ||
517 | vpshufb RKM, RX, RR1; /* be: IV2, IV3 */ | ||
518 | vpsubq RKR, RX, RX; | ||
519 | vpshufb RKM, RX, RL2; /* be: IV4, IV5 */ | ||
520 | vpsubq RKR, RX, RX; | ||
521 | vpshufb RKM, RX, RR2; /* be: IV6, IV7 */ | ||
522 | vpsubq RKR, RX, RX; | ||
523 | vpshufb RKM, RX, RL3; /* be: IV8, IV9 */ | ||
524 | vpsubq RKR, RX, RX; | ||
525 | vpshufb RKM, RX, RR3; /* be: IV10, IV11 */ | ||
526 | vpsubq RKR, RX, RX; | ||
527 | vpshufb RKM, RX, RL4; /* be: IV12, IV13 */ | ||
528 | vpsubq RKR, RX, RX; | ||
529 | vpshufb RKM, RX, RR4; /* be: IV14, IV15 */ | ||
530 | |||
531 | /* store last IV */ | ||
532 | vpsubq RTMP, RX, RX; /* le: IV16, IV14 */ | ||
533 | vpshufb R1ST, RX, RX; /* be: IV16, IV16 */ | ||
534 | vmovq RX, (%rcx); | ||
535 | |||
536 | call __cast5_enc_blk16; | ||
537 | |||
538 | /* dst = src ^ iv */ | ||
539 | vpxor (0*16)(%r12), RR1, RR1; | ||
540 | vpxor (1*16)(%r12), RL1, RL1; | ||
541 | vpxor (2*16)(%r12), RR2, RR2; | ||
542 | vpxor (3*16)(%r12), RL2, RL2; | ||
543 | vpxor (4*16)(%r12), RR3, RR3; | ||
544 | vpxor (5*16)(%r12), RL3, RL3; | ||
545 | vpxor (6*16)(%r12), RR4, RR4; | ||
546 | vpxor (7*16)(%r12), RL4, RL4; | ||
547 | vmovdqu RR1, (0*16)(%r11); | ||
548 | vmovdqu RL1, (1*16)(%r11); | ||
549 | vmovdqu RR2, (2*16)(%r11); | ||
550 | vmovdqu RL2, (3*16)(%r11); | ||
551 | vmovdqu RR3, (4*16)(%r11); | ||
552 | vmovdqu RL3, (5*16)(%r11); | ||
553 | vmovdqu RR4, (6*16)(%r11); | ||
554 | vmovdqu RL4, (7*16)(%r11); | ||
555 | |||
556 | popq %r12; | ||
557 | |||
558 | ret; | ||
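cast5_ctr_16way() builds all 16 counters without touching memory: vpcmpeqd materializes -1, vpaddq doubles it to -2, and the chain of vpsubq subtractions therefore adds 1 and then 2 per step, producing two consecutive 64-bit counters per register before they are byte-swapped to big endian. The scalar equivalent (sketch):

/* iv points at the 64-bit big-endian counter passed in %rcx. */
u64 ctr = be64_to_cpu(*iv);
u64 ctrblks[16];
int i;

for (i = 0; i < 16; i++)
        ctrblks[i] = ctr + i;           /* swapped to BE before encryption */

*iv = cpu_to_be64(ctr + 16);            /* write back the last IV */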
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index e0ea14f9547f..c6631813dc11 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -37,29 +37,14 @@ | |||
37 | 37 | ||
38 | #define CAST5_PARALLEL_BLOCKS 16 | 38 | #define CAST5_PARALLEL_BLOCKS 16 |
39 | 39 | ||
40 | asmlinkage void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, | 40 | asmlinkage void cast5_ecb_enc_16way(struct cast5_ctx *ctx, u8 *dst, |
41 | const u8 *src, bool xor); | ||
42 | asmlinkage void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst, | ||
43 | const u8 *src); | 41 | const u8 *src); |
44 | 42 | asmlinkage void cast5_ecb_dec_16way(struct cast5_ctx *ctx, u8 *dst, | |
45 | static inline void cast5_enc_blk_xway(struct cast5_ctx *ctx, u8 *dst, | 43 | const u8 *src); |
46 | const u8 *src) | 44 | asmlinkage void cast5_cbc_dec_16way(struct cast5_ctx *ctx, u8 *dst, |
47 | { | 45 | const u8 *src); |
48 | __cast5_enc_blk_16way(ctx, dst, src, false); | 46 | asmlinkage void cast5_ctr_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src, |
49 | } | 47 | __be64 *iv); |
50 | |||
51 | static inline void cast5_enc_blk_xway_xor(struct cast5_ctx *ctx, u8 *dst, | ||
52 | const u8 *src) | ||
53 | { | ||
54 | __cast5_enc_blk_16way(ctx, dst, src, true); | ||
55 | } | ||
56 | |||
57 | static inline void cast5_dec_blk_xway(struct cast5_ctx *ctx, u8 *dst, | ||
58 | const u8 *src) | ||
59 | { | ||
60 | cast5_dec_blk_16way(ctx, dst, src); | ||
61 | } | ||
62 | |||
63 | 48 | ||
64 | static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) | 49 | static inline bool cast5_fpu_begin(bool fpu_enabled, unsigned int nbytes) |
65 | { | 50 | { |
@@ -79,8 +64,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | |||
79 | struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | 64 | struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); |
80 | const unsigned int bsize = CAST5_BLOCK_SIZE; | 65 | const unsigned int bsize = CAST5_BLOCK_SIZE; |
81 | unsigned int nbytes; | 66 | unsigned int nbytes; |
67 | void (*fn)(struct cast5_ctx *ctx, u8 *dst, const u8 *src); | ||
82 | int err; | 68 | int err; |
83 | 69 | ||
70 | fn = (enc) ? cast5_ecb_enc_16way : cast5_ecb_dec_16way; | ||
71 | |||
84 | err = blkcipher_walk_virt(desc, walk); | 72 | err = blkcipher_walk_virt(desc, walk); |
85 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | 73 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; |
86 | 74 | ||
@@ -93,10 +81,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | |||
93 | /* Process multi-block batch */ | 81 | /* Process multi-block batch */ |
94 | if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { | 82 | if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { |
95 | do { | 83 | do { |
96 | if (enc) | 84 | fn(ctx, wdst, wsrc); |
97 | cast5_enc_blk_xway(ctx, wdst, wsrc); | ||
98 | else | ||
99 | cast5_dec_blk_xway(ctx, wdst, wsrc); | ||
100 | 85 | ||
101 | wsrc += bsize * CAST5_PARALLEL_BLOCKS; | 86 | wsrc += bsize * CAST5_PARALLEL_BLOCKS; |
102 | wdst += bsize * CAST5_PARALLEL_BLOCKS; | 87 | wdst += bsize * CAST5_PARALLEL_BLOCKS; |
@@ -107,12 +92,11 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | |||
107 | goto done; | 92 | goto done; |
108 | } | 93 | } |
109 | 94 | ||
95 | fn = (enc) ? __cast5_encrypt : __cast5_decrypt; | ||
96 | |||
110 | /* Handle leftovers */ | 97 | /* Handle leftovers */ |
111 | do { | 98 | do { |
112 | if (enc) | 99 | fn(ctx, wdst, wsrc); |
113 | __cast5_encrypt(ctx, wdst, wsrc); | ||
114 | else | ||
115 | __cast5_decrypt(ctx, wdst, wsrc); | ||
116 | 100 | ||
117 | wsrc += bsize; | 101 | wsrc += bsize; |
118 | wdst += bsize; | 102 | wdst += bsize; |
@@ -194,9 +178,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, | |||
194 | unsigned int nbytes = walk->nbytes; | 178 | unsigned int nbytes = walk->nbytes; |
195 | u64 *src = (u64 *)walk->src.virt.addr; | 179 | u64 *src = (u64 *)walk->src.virt.addr; |
196 | u64 *dst = (u64 *)walk->dst.virt.addr; | 180 | u64 *dst = (u64 *)walk->dst.virt.addr; |
197 | u64 ivs[CAST5_PARALLEL_BLOCKS - 1]; | ||
198 | u64 last_iv; | 181 | u64 last_iv; |
199 | int i; | ||
200 | 182 | ||
201 | /* Start of the last block. */ | 183 | /* Start of the last block. */ |
202 | src += nbytes / bsize - 1; | 184 | src += nbytes / bsize - 1; |
@@ -211,13 +193,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, | |||
211 | src -= CAST5_PARALLEL_BLOCKS - 1; | 193 | src -= CAST5_PARALLEL_BLOCKS - 1; |
212 | dst -= CAST5_PARALLEL_BLOCKS - 1; | 194 | dst -= CAST5_PARALLEL_BLOCKS - 1; |
213 | 195 | ||
214 | for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) | 196 | cast5_cbc_dec_16way(ctx, (u8 *)dst, (u8 *)src); |
215 | ivs[i] = src[i]; | ||
216 | |||
217 | cast5_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); | ||
218 | |||
219 | for (i = 0; i < CAST5_PARALLEL_BLOCKS - 1; i++) | ||
220 | *(dst + (i + 1)) ^= *(ivs + i); | ||
221 | 197 | ||
222 | nbytes -= bsize; | 198 | nbytes -= bsize; |
223 | if (nbytes < bsize) | 199 | if (nbytes < bsize) |
@@ -298,23 +274,12 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | |||
298 | unsigned int nbytes = walk->nbytes; | 274 | unsigned int nbytes = walk->nbytes; |
299 | u64 *src = (u64 *)walk->src.virt.addr; | 275 | u64 *src = (u64 *)walk->src.virt.addr; |
300 | u64 *dst = (u64 *)walk->dst.virt.addr; | 276 | u64 *dst = (u64 *)walk->dst.virt.addr; |
301 | u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); | ||
302 | __be64 ctrblocks[CAST5_PARALLEL_BLOCKS]; | ||
303 | int i; | ||
304 | 277 | ||
305 | /* Process multi-block batch */ | 278 | /* Process multi-block batch */ |
306 | if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { | 279 | if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { |
307 | do { | 280 | do { |
308 | /* create ctrblks for parallel encrypt */ | 281 | cast5_ctr_16way(ctx, (u8 *)dst, (u8 *)src, |
309 | for (i = 0; i < CAST5_PARALLEL_BLOCKS; i++) { | 282 | (__be64 *)walk->iv); |
310 | if (dst != src) | ||
311 | dst[i] = src[i]; | ||
312 | |||
313 | ctrblocks[i] = cpu_to_be64(ctrblk++); | ||
314 | } | ||
315 | |||
316 | cast5_enc_blk_xway_xor(ctx, (u8 *)dst, | ||
317 | (u8 *)ctrblocks); | ||
318 | 283 | ||
319 | src += CAST5_PARALLEL_BLOCKS; | 284 | src += CAST5_PARALLEL_BLOCKS; |
320 | dst += CAST5_PARALLEL_BLOCKS; | 285 | dst += CAST5_PARALLEL_BLOCKS; |
@@ -327,13 +292,16 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | |||
327 | 292 | ||
328 | /* Handle leftovers */ | 293 | /* Handle leftovers */ |
329 | do { | 294 | do { |
295 | u64 ctrblk; | ||
296 | |||
330 | if (dst != src) | 297 | if (dst != src) |
331 | *dst = *src; | 298 | *dst = *src; |
332 | 299 | ||
333 | ctrblocks[0] = cpu_to_be64(ctrblk++); | 300 | ctrblk = *(u64 *)walk->iv; |
301 | be64_add_cpu((__be64 *)walk->iv, 1); | ||
334 | 302 | ||
335 | __cast5_encrypt(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); | 303 | __cast5_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); |
336 | *dst ^= ctrblocks[0]; | 304 | *dst ^= ctrblk; |
337 | 305 | ||
338 | src += 1; | 306 | src += 1; |
339 | dst += 1; | 307 | dst += 1; |
@@ -341,7 +309,6 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | |||
341 | } while (nbytes >= bsize); | 309 | } while (nbytes >= bsize); |
342 | 310 | ||
343 | done: | 311 | done: |
344 | *(__be64 *)walk->iv = cpu_to_be64(ctrblk); | ||
345 | return nbytes; | 312 | return nbytes; |
346 | } | 313 | } |
347 | 314 | ||
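With the batch path advancing the counter inside the assembly, the leftover loop now bumps walk->iv in place; note it encrypts the counter in its big-endian byte order, as CTR requires. be64_add_cpu() is the generic byteorder helper, whose semantics are (sketch; believed to match linux/byteorder/generic.h):

static inline void be64_add_cpu(__be64 *var, u64 val)
{
        *var = cpu_to_be64(be64_to_cpu(*var) + val);
}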
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 218d283772f4..2569d0da841f 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -23,22 +23,24 @@ | |||
23 | * | 23 | * |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include "glue_helper-asm-avx.S" | ||
27 | |||
26 | .file "cast6-avx-x86_64-asm_64.S" | 28 | .file "cast6-avx-x86_64-asm_64.S" |
27 | 29 | ||
28 | .extern cast6_s1 | 30 | .extern cast_s1 |
29 | .extern cast6_s2 | 31 | .extern cast_s2 |
30 | .extern cast6_s3 | 32 | .extern cast_s3 |
31 | .extern cast6_s4 | 33 | .extern cast_s4 |
32 | 34 | ||
33 | /* structure of crypto context */ | 35 | /* structure of crypto context */ |
34 | #define km 0 | 36 | #define km 0 |
35 | #define kr (12*4*4) | 37 | #define kr (12*4*4) |
36 | 38 | ||
37 | /* s-boxes */ | 39 | /* s-boxes */ |
38 | #define s1 cast6_s1 | 40 | #define s1 cast_s1 |
39 | #define s2 cast6_s2 | 41 | #define s2 cast_s2 |
40 | #define s3 cast6_s3 | 42 | #define s3 cast_s3 |
41 | #define s4 cast6_s4 | 43 | #define s4 cast_s4 |
42 | 44 | ||
43 | /********************************************************************** | 45 | /********************************************************************** |
44 | 8-way AVX cast6 | 46 | 8-way AVX cast6 |
@@ -205,11 +207,7 @@ | |||
205 | vpunpcklqdq x3, t2, x2; \ | 207 | vpunpcklqdq x3, t2, x2; \ |
206 | vpunpckhqdq x3, t2, x3; | 208 | vpunpckhqdq x3, t2, x3; |
207 | 209 | ||
208 | #define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \ | 210 | #define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \ |
209 | vmovdqu (0*4*4)(in), x0; \ | ||
210 | vmovdqu (1*4*4)(in), x1; \ | ||
211 | vmovdqu (2*4*4)(in), x2; \ | ||
212 | vmovdqu (3*4*4)(in), x3; \ | ||
213 | vpshufb rmask, x0, x0; \ | 211 | vpshufb rmask, x0, x0; \ |
214 | vpshufb rmask, x1, x1; \ | 212 | vpshufb rmask, x1, x1; \ |
215 | vpshufb rmask, x2, x2; \ | 213 | vpshufb rmask, x2, x2; \ |
@@ -217,39 +215,21 @@ | |||
217 | \ | 215 | \ |
218 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | 216 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
219 | 217 | ||
220 | #define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ | 218 | #define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \ |
221 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | 219 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ |
222 | \ | 220 | \ |
223 | vpshufb rmask, x0, x0; \ | 221 | vpshufb rmask, x0, x0; \ |
224 | vpshufb rmask, x1, x1; \ | 222 | vpshufb rmask, x1, x1; \ |
225 | vpshufb rmask, x2, x2; \ | 223 | vpshufb rmask, x2, x2; \ |
226 | vpshufb rmask, x3, x3; \ | 224 | vpshufb rmask, x3, x3; |
227 | vmovdqu x0, (0*4*4)(out); \ | ||
228 | vmovdqu x1, (1*4*4)(out); \ | ||
229 | vmovdqu x2, (2*4*4)(out); \ | ||
230 | vmovdqu x3, (3*4*4)(out); | ||
231 | |||
232 | #define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \ | ||
233 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
234 | \ | ||
235 | vpshufb rmask, x0, x0; \ | ||
236 | vpshufb rmask, x1, x1; \ | ||
237 | vpshufb rmask, x2, x2; \ | ||
238 | vpshufb rmask, x3, x3; \ | ||
239 | vpxor (0*4*4)(out), x0, x0; \ | ||
240 | vmovdqu x0, (0*4*4)(out); \ | ||
241 | vpxor (1*4*4)(out), x1, x1; \ | ||
242 | vmovdqu x1, (1*4*4)(out); \ | ||
243 | vpxor (2*4*4)(out), x2, x2; \ | ||
244 | vmovdqu x2, (2*4*4)(out); \ | ||
245 | vpxor (3*4*4)(out), x3, x3; \ | ||
246 | vmovdqu x3, (3*4*4)(out); | ||
247 | 225 | ||
248 | .data | 226 | .data |
249 | 227 | ||
250 | .align 16 | 228 | .align 16 |
251 | .Lbswap_mask: | 229 | .Lbswap_mask: |
252 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 | 230 | .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 |
231 | .Lbswap128_mask: | ||
232 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
253 | .Lrkr_enc_Q_Q_QBAR_QBAR: | 233 | .Lrkr_enc_Q_Q_QBAR_QBAR: |
254 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12 | 234 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12 |
255 | .Lrkr_enc_QBAR_QBAR_QBAR_QBAR: | 235 | .Lrkr_enc_QBAR_QBAR_QBAR_QBAR: |
@@ -269,31 +249,26 @@ | |||
269 | 249 | ||
270 | .text | 250 | .text |
271 | 251 | ||
272 | .align 16 | 252 | .align 8 |
273 | .global __cast6_enc_blk_8way | 253 | .type __cast6_enc_blk8,@function; |
274 | .type __cast6_enc_blk_8way,@function; | ||
275 | 254 | ||
276 | __cast6_enc_blk_8way: | 255 | __cast6_enc_blk8: |
277 | /* input: | 256 | /* input: |
278 | * %rdi: ctx, CTX | 257 | * %rdi: ctx, CTX |
279 | * %rsi: dst | 258 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks |
280 | * %rdx: src | 259 | * output: |
281 | * %rcx: bool, if true: xor output | 260 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
282 | */ | 261 | */ |
283 | 262 | ||
284 | pushq %rbp; | 263 | pushq %rbp; |
285 | pushq %rbx; | 264 | pushq %rbx; |
286 | pushq %rcx; | ||
287 | 265 | ||
288 | vmovdqa .Lbswap_mask, RKM; | 266 | vmovdqa .Lbswap_mask, RKM; |
289 | vmovd .Lfirst_mask, R1ST; | 267 | vmovd .Lfirst_mask, R1ST; |
290 | vmovd .L32_mask, R32; | 268 | vmovd .L32_mask, R32; |
291 | 269 | ||
292 | leaq (4*4*4)(%rdx), %rax; | 270 | inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); |
293 | inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); | 271 | inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); |
294 | inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); | ||
295 | |||
296 | movq %rsi, %r11; | ||
297 | 272 | ||
298 | preload_rkr(0, dummy, none); | 273 | preload_rkr(0, dummy, none); |
299 | Q(0); | 274 | Q(0); |
@@ -311,36 +286,25 @@ __cast6_enc_blk_8way: | |||
311 | QBAR(10); | 286 | QBAR(10); |
312 | QBAR(11); | 287 | QBAR(11); |
313 | 288 | ||
314 | popq %rcx; | ||
315 | popq %rbx; | 289 | popq %rbx; |
316 | popq %rbp; | 290 | popq %rbp; |
317 | 291 | ||
318 | vmovdqa .Lbswap_mask, RKM; | 292 | vmovdqa .Lbswap_mask, RKM; |
319 | leaq (4*4*4)(%r11), %rax; | ||
320 | |||
321 | testb %cl, %cl; | ||
322 | jnz __enc_xor8; | ||
323 | |||
324 | outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); | ||
325 | outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); | ||
326 | |||
327 | ret; | ||
328 | 293 | ||
329 | __enc_xor8: | 294 | outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); |
330 | outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); | 295 | outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); |
331 | outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); | ||
332 | 296 | ||
333 | ret; | 297 | ret; |
334 | 298 | ||
335 | .align 16 | 299 | .align 8 |
336 | .global cast6_dec_blk_8way | 300 | .type __cast6_dec_blk8,@function; |
337 | .type cast6_dec_blk_8way,@function; | ||
338 | 301 | ||
339 | cast6_dec_blk_8way: | 302 | __cast6_dec_blk8: |
340 | /* input: | 303 | /* input: |
341 | * %rdi: ctx, CTX | 304 | * %rdi: ctx, CTX |
342 | * %rsi: dst | 305 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
343 | * %rdx: src | 306 | * output: |
307 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks | ||
344 | */ | 308 | */ |
345 | 309 | ||
346 | pushq %rbp; | 310 | pushq %rbp; |
@@ -350,11 +314,8 @@ cast6_dec_blk_8way: | |||
350 | vmovd .Lfirst_mask, R1ST; | 314 | vmovd .Lfirst_mask, R1ST; |
351 | vmovd .L32_mask, R32; | 315 | vmovd .L32_mask, R32; |
352 | 316 | ||
353 | leaq (4*4*4)(%rdx), %rax; | 317 | inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); |
354 | inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); | 318 | inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); |
355 | inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); | ||
356 | |||
357 | movq %rsi, %r11; | ||
358 | 319 | ||
359 | preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q); | 320 | preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q); |
360 | Q(11); | 321 | Q(11); |
@@ -376,8 +337,103 @@ cast6_dec_blk_8way: | |||
376 | popq %rbp; | 337 | popq %rbp; |
377 | 338 | ||
378 | vmovdqa .Lbswap_mask, RKM; | 339 | vmovdqa .Lbswap_mask, RKM; |
379 | leaq (4*4*4)(%r11), %rax; | 340 | outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); |
380 | outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM); | 341 | outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); |
381 | outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM); | 342 | |
343 | ret; | ||
344 | |||
345 | .align 8 | ||
346 | .global cast6_ecb_enc_8way | ||
347 | .type cast6_ecb_enc_8way,@function; | ||
348 | |||
349 | cast6_ecb_enc_8way: | ||
350 | /* input: | ||
351 | * %rdi: ctx, CTX | ||
352 | * %rsi: dst | ||
353 | * %rdx: src | ||
354 | */ | ||
355 | |||
356 | movq %rsi, %r11; | ||
357 | |||
358 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
359 | |||
360 | call __cast6_enc_blk8; | ||
361 | |||
362 | store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
363 | |||
364 | ret; | ||
365 | |||
366 | .align 8 | ||
367 | .global cast6_ecb_dec_8way | ||
368 | .type cast6_ecb_dec_8way,@function; | ||
369 | |||
370 | cast6_ecb_dec_8way: | ||
371 | /* input: | ||
372 | * %rdi: ctx, CTX | ||
373 | * %rsi: dst | ||
374 | * %rdx: src | ||
375 | */ | ||
376 | |||
377 | movq %rsi, %r11; | ||
378 | |||
379 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
380 | |||
381 | call __cast6_dec_blk8; | ||
382 | |||
383 | store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
384 | |||
385 | ret; | ||
386 | |||
387 | .align 8 | ||
388 | .global cast6_cbc_dec_8way | ||
389 | .type cast6_cbc_dec_8way,@function; | ||
390 | |||
391 | cast6_cbc_dec_8way: | ||
392 | /* input: | ||
393 | * %rdi: ctx, CTX | ||
394 | * %rsi: dst | ||
395 | * %rdx: src | ||
396 | */ | ||
397 | |||
398 | pushq %r12; | ||
399 | |||
400 | movq %rsi, %r11; | ||
401 | movq %rdx, %r12; | ||
402 | |||
403 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
404 | |||
405 | call __cast6_dec_blk8; | ||
406 | |||
407 | store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
408 | |||
409 | popq %r12; | ||
410 | |||
411 | ret; | ||
412 | |||
413 | .align 8 | ||
414 | .global cast6_ctr_8way | ||
415 | .type cast6_ctr_8way,@function; | ||
416 | |||
417 | cast6_ctr_8way: | ||
418 | /* input: | ||
419 | * %rdi: ctx, CTX | ||
420 | * %rsi: dst | ||
421 | * %rdx: src | ||
422 | * %rcx: iv (little endian, 128bit) | ||
423 | */ | ||
424 | |||
425 | pushq %r12; | ||
426 | |||
427 | movq %rsi, %r11; | ||
428 | movq %rdx, %r12; | ||
429 | |||
430 | load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
431 | RD2, RX, RKR, RKM); | ||
432 | |||
433 | call __cast6_enc_blk8; | ||
434 | |||
435 | store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
436 | |||
437 | popq %r12; | ||
382 | 438 | ||
383 | ret; | 439 | ret; |
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 15e5f85a5011..92f7ca24790a 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -40,79 +40,34 @@ | |||
40 | 40 | ||
41 | #define CAST6_PARALLEL_BLOCKS 8 | 41 | #define CAST6_PARALLEL_BLOCKS 8 |
42 | 42 | ||
43 | asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst, | 43 | asmlinkage void cast6_ecb_enc_8way(struct cast6_ctx *ctx, u8 *dst, |
44 | const u8 *src, bool xor); | 44 | const u8 *src); |
45 | asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst, | 45 | asmlinkage void cast6_ecb_dec_8way(struct cast6_ctx *ctx, u8 *dst, |
46 | const u8 *src); | 46 | const u8 *src); |
47 | 47 | ||
48 | static inline void cast6_enc_blk_xway(struct cast6_ctx *ctx, u8 *dst, | 48 | asmlinkage void cast6_cbc_dec_8way(struct cast6_ctx *ctx, u8 *dst, |
49 | const u8 *src) | 49 | const u8 *src); |
50 | { | 50 | asmlinkage void cast6_ctr_8way(struct cast6_ctx *ctx, u8 *dst, const u8 *src, |
51 | __cast6_enc_blk_8way(ctx, dst, src, false); | 51 | le128 *iv); |
52 | } | ||
53 | |||
54 | static inline void cast6_enc_blk_xway_xor(struct cast6_ctx *ctx, u8 *dst, | ||
55 | const u8 *src) | ||
56 | { | ||
57 | __cast6_enc_blk_8way(ctx, dst, src, true); | ||
58 | } | ||
59 | |||
60 | static inline void cast6_dec_blk_xway(struct cast6_ctx *ctx, u8 *dst, | ||
61 | const u8 *src) | ||
62 | { | ||
63 | cast6_dec_blk_8way(ctx, dst, src); | ||
64 | } | ||
65 | |||
66 | |||
67 | static void cast6_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) | ||
68 | { | ||
69 | u128 ivs[CAST6_PARALLEL_BLOCKS - 1]; | ||
70 | unsigned int j; | ||
71 | |||
72 | for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) | ||
73 | ivs[j] = src[j]; | ||
74 | |||
75 | cast6_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); | ||
76 | |||
77 | for (j = 0; j < CAST6_PARALLEL_BLOCKS - 1; j++) | ||
78 | u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); | ||
79 | } | ||
80 | 52 | ||
81 | static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) | 53 | static void cast6_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
82 | { | 54 | { |
83 | be128 ctrblk; | 55 | be128 ctrblk; |
84 | 56 | ||
85 | u128_to_be128(&ctrblk, iv); | 57 | le128_to_be128(&ctrblk, iv); |
86 | u128_inc(iv); | 58 | le128_inc(iv); |
87 | 59 | ||
88 | __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); | 60 | __cast6_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); |
89 | u128_xor(dst, src, (u128 *)&ctrblk); | 61 | u128_xor(dst, src, (u128 *)&ctrblk); |
90 | } | 62 | } |
91 | 63 | ||
92 | static void cast6_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, | ||
93 | u128 *iv) | ||
94 | { | ||
95 | be128 ctrblks[CAST6_PARALLEL_BLOCKS]; | ||
96 | unsigned int i; | ||
97 | |||
98 | for (i = 0; i < CAST6_PARALLEL_BLOCKS; i++) { | ||
99 | if (dst != src) | ||
100 | dst[i] = src[i]; | ||
101 | |||
102 | u128_to_be128(&ctrblks[i], iv); | ||
103 | u128_inc(iv); | ||
104 | } | ||
105 | |||
106 | cast6_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); | ||
107 | } | ||
108 | |||
109 | static const struct common_glue_ctx cast6_enc = { | 64 | static const struct common_glue_ctx cast6_enc = { |
110 | .num_funcs = 2, | 65 | .num_funcs = 2, |
111 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, | 66 | .fpu_blocks_limit = CAST6_PARALLEL_BLOCKS, |
112 | 67 | ||
113 | .funcs = { { | 68 | .funcs = { { |
114 | .num_blocks = CAST6_PARALLEL_BLOCKS, | 69 | .num_blocks = CAST6_PARALLEL_BLOCKS, |
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_enc_blk_xway) } | 70 | .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_enc_8way) } |
116 | }, { | 71 | }, { |
117 | .num_blocks = 1, | 72 | .num_blocks = 1, |
118 | .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } | 73 | .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_encrypt) } |
@@ -125,7 +80,7 @@ static const struct common_glue_ctx cast6_ctr = { | |||
125 | 80 | ||
126 | .funcs = { { | 81 | .funcs = { { |
127 | .num_blocks = CAST6_PARALLEL_BLOCKS, | 82 | .num_blocks = CAST6_PARALLEL_BLOCKS, |
128 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr_xway) } | 83 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_ctr_8way) } |
129 | }, { | 84 | }, { |
130 | .num_blocks = 1, | 85 | .num_blocks = 1, |
131 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } | 86 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(cast6_crypt_ctr) } |
@@ -138,7 +93,7 @@ static const struct common_glue_ctx cast6_dec = { | |||
138 | 93 | ||
139 | .funcs = { { | 94 | .funcs = { { |
140 | .num_blocks = CAST6_PARALLEL_BLOCKS, | 95 | .num_blocks = CAST6_PARALLEL_BLOCKS, |
141 | .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_dec_blk_xway) } | 96 | .fn_u = { .ecb = GLUE_FUNC_CAST(cast6_ecb_dec_8way) } |
142 | }, { | 97 | }, { |
143 | .num_blocks = 1, | 98 | .num_blocks = 1, |
144 | .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } | 99 | .fn_u = { .ecb = GLUE_FUNC_CAST(__cast6_decrypt) } |
@@ -151,7 +106,7 @@ static const struct common_glue_ctx cast6_dec_cbc = { | |||
151 | 106 | ||
152 | .funcs = { { | 107 | .funcs = { { |
153 | .num_blocks = CAST6_PARALLEL_BLOCKS, | 108 | .num_blocks = CAST6_PARALLEL_BLOCKS, |
154 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_decrypt_cbc_xway) } | 109 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(cast6_cbc_dec_8way) } |
155 | }, { | 110 | }, { |
156 | .num_blocks = 1, | 111 | .num_blocks = 1, |
157 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } | 112 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__cast6_decrypt) } |
@@ -215,7 +170,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
215 | ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); | 170 | ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); |
216 | 171 | ||
217 | if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { | 172 | if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { |
218 | cast6_enc_blk_xway(ctx->ctx, srcdst, srcdst); | 173 | cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst); |
219 | return; | 174 | return; |
220 | } | 175 | } |
221 | 176 | ||
@@ -232,7 +187,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
232 | ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); | 187 | ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); |
233 | 188 | ||
234 | if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { | 189 | if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { |
235 | cast6_dec_blk_xway(ctx->ctx, srcdst, srcdst); | 190 | cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst); |
236 | return; | 191 | return; |
237 | } | 192 | } |
238 | 193 | ||
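
The cast6 glue change above is representative of the whole series: the running CTR counter moves from u128 to le128, so the counter can be incremented with plain arithmetic and is only byte-swapped to big-endian when a counter block is actually encrypted. A minimal userspace sketch of the resulting one-block CTR step; the struct layouts, helper names and the encrypt() callback below are illustrative assumptions, not the kernel's own definitions (encrypt() stands in for __cast6_encrypt()):

#include <stdint.h>

typedef struct { uint64_t lo, hi; } le128;   /* assumed layout: low qword first */
typedef struct { uint64_t hi, lo; } be128;   /* assumed layout: high qword first */

static void le128_to_be128_sketch(be128 *dst, const le128 *src)
{
	/* on a little-endian host: byte-swap each half and swap their order */
	dst->hi = __builtin_bswap64(src->hi);
	dst->lo = __builtin_bswap64(src->lo);
}

static void le128_inc_sketch(le128 *i)
{
	if (++i->lo == 0)	/* carry out of the low 64 bits */
		i->hi++;
}

/* one CTR block: dst = src XOR E_K(byteswap(iv)); iv advances afterwards */
static void ctr_crypt_one(void *ctx, uint8_t *dst, const uint8_t *src,
			  le128 *iv,
			  void (*encrypt)(void *ctx, uint8_t *out, const uint8_t *in))
{
	be128 ctrblk;
	uint8_t ks[16];
	int n;

	le128_to_be128_sketch(&ctrblk, iv);
	le128_inc_sketch(iv);

	encrypt(ctx, ks, (const uint8_t *)&ctrblk);
	for (n = 0; n < 16; n++)
		dst[n] = src[n] ^ ks[n];
}

Keeping the counter little-endian makes the wrap check a single compare-and-carry, which is exactly what the inc_le128 assembler macro further below implements inside an XMM register.
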
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel_glue.c index 493f959261f7..6812ad98355c 100644 --- a/arch/x86/crypto/crc32c-intel.c +++ b/arch/x86/crypto/crc32c-intel_glue.c | |||
@@ -32,6 +32,8 @@ | |||
32 | 32 | ||
33 | #include <asm/cpufeature.h> | 33 | #include <asm/cpufeature.h> |
34 | #include <asm/cpu_device_id.h> | 34 | #include <asm/cpu_device_id.h> |
35 | #include <asm/i387.h> | ||
36 | #include <asm/fpu-internal.h> | ||
35 | 37 | ||
36 | #define CHKSUM_BLOCK_SIZE 1 | 38 | #define CHKSUM_BLOCK_SIZE 1 |
37 | #define CHKSUM_DIGEST_SIZE 4 | 39 | #define CHKSUM_DIGEST_SIZE 4 |
@@ -44,6 +46,31 @@ | |||
44 | #define REX_PRE | 46 | #define REX_PRE |
45 | #endif | 47 | #endif |
46 | 48 | ||
49 | #ifdef CONFIG_X86_64 | ||
50 | /* | ||
51 | * use carryless multiply version of crc32c when buffer | ||
52 | * size is >= 512 (when eager fpu is enabled) or | ||
53 | * >= 1024 (when eager fpu is disabled) to account | ||
54 | * for fpu state save/restore overhead. | ||
55 | */ | ||
56 | #define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 | ||
57 | #define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 | ||
58 | |||
59 | asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, | ||
60 | unsigned int crc_init); | ||
61 | static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; | ||
62 | #if defined(X86_FEATURE_EAGER_FPU) | ||
63 | #define set_pcl_breakeven_point() \ | ||
64 | do { \ | ||
65 | if (!use_eager_fpu()) \ | ||
66 | crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ | ||
67 | } while (0) | ||
68 | #else | ||
69 | #define set_pcl_breakeven_point() \ | ||
70 | (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) | ||
71 | #endif | ||
72 | #endif /* CONFIG_X86_64 */ | ||
73 | |||
47 | static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) | 74 | static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) |
48 | { | 75 | { |
49 | while (length--) { | 76 | while (length--) { |
@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm) | |||
154 | return 0; | 181 | return 0; |
155 | } | 182 | } |
156 | 183 | ||
184 | #ifdef CONFIG_X86_64 | ||
185 | static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, | ||
186 | unsigned int len) | ||
187 | { | ||
188 | u32 *crcp = shash_desc_ctx(desc); | ||
189 | |||
190 | /* | ||
191 | * use faster PCL version if datasize is large enough to | ||
192 | * overcome kernel fpu state save/restore overhead | ||
193 | */ | ||
194 | if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { | ||
195 | kernel_fpu_begin(); | ||
196 | *crcp = crc_pcl(data, len, *crcp); | ||
197 | kernel_fpu_end(); | ||
198 | } else | ||
199 | *crcp = crc32c_intel_le_hw(*crcp, data, len); | ||
200 | return 0; | ||
201 | } | ||
202 | |||
203 | static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, | ||
204 | u8 *out) | ||
205 | { | ||
206 | if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { | ||
207 | kernel_fpu_begin(); | ||
208 | *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); | ||
209 | kernel_fpu_end(); | ||
210 | } else | ||
211 | *(__le32 *)out = | ||
212 | ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); | ||
213 | return 0; | ||
214 | } | ||
215 | |||
216 | static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data, | ||
217 | unsigned int len, u8 *out) | ||
218 | { | ||
219 | return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out); | ||
220 | } | ||
221 | |||
222 | static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data, | ||
223 | unsigned int len, u8 *out) | ||
224 | { | ||
225 | return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, | ||
226 | out); | ||
227 | } | ||
228 | #endif /* CONFIG_X86_64 */ | ||
229 | |||
157 | static struct shash_alg alg = { | 230 | static struct shash_alg alg = { |
158 | .setkey = crc32c_intel_setkey, | 231 | .setkey = crc32c_intel_setkey, |
159 | .init = crc32c_intel_init, | 232 | .init = crc32c_intel_init, |
@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void) | |||
184 | { | 257 | { |
185 | if (!x86_match_cpu(crc32c_cpu_id)) | 258 | if (!x86_match_cpu(crc32c_cpu_id)) |
186 | return -ENODEV; | 259 | return -ENODEV; |
260 | #ifdef CONFIG_X86_64 | ||
261 | if (cpu_has_pclmulqdq) { | ||
262 | alg.update = crc32c_pcl_intel_update; | ||
263 | alg.finup = crc32c_pcl_intel_finup; | ||
264 | alg.digest = crc32c_pcl_intel_digest; | ||
265 | set_pcl_breakeven_point(); | ||
266 | } | ||
267 | #endif | ||
187 | return crypto_register_shash(&alg); | 268 | return crypto_register_shash(&alg); |
188 | } | 269 | } |
189 | 270 | ||
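
The registration hook above only rewires alg.update/finup/digest to the PCLMULQDQ-assisted versions when the CPU advertises the instruction, and those versions still fall back to the plain crc32 loop for short buffers, because kernel_fpu_begin()/kernel_fpu_end() cost more than the speedup below the breakeven size. A rough userspace sketch of the same dispatch shape using the SSE4.2 intrinsics; the threshold is the eager-FPU value from the patch, and crc_pcl_stub() merely stands in for the assembler routine (build with -msse4.2):

#include <stdint.h>
#include <stddef.h>
#include <nmmintrin.h>			/* _mm_crc32_u8 / _mm_crc32_u64 */

#define PCL_BREAKEVEN 512		/* CRC32C_PCL_BREAKEVEN_EAGERFPU */

static uint32_t crc32c_hw(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len >= 8) {		/* one crc32q per 8 bytes */
		uint64_t v;
		__builtin_memcpy(&v, p, 8);
		crc = (uint32_t)_mm_crc32_u64(crc, v);
		p += 8;
		len -= 8;
	}
	while (len--)			/* byte-at-a-time tail */
		crc = _mm_crc32_u8(crc, *p++);
	return crc;
}

/* stand-in for the crc_pcl() assembler routine added by this patch */
static uint32_t crc_pcl_stub(const uint8_t *buf, int len, uint32_t crc)
{
	return crc32c_hw(crc, buf, (size_t)len);
}

static uint32_t crc32c_update(uint32_t crc, const uint8_t *p, size_t len)
{
	if (len >= PCL_BREAKEVEN)	/* long buffer: 3-stream pclmul path */
		return crc_pcl_stub(p, (int)len, crc);
	return crc32c_hw(crc, p, len);	/* short buffer: plain crc32 loop */
}
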
diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S new file mode 100644 index 000000000000..93c6d39237ac --- /dev/null +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S | |||
@@ -0,0 +1,460 @@ | |||
1 | /* | ||
2 | * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) | ||
3 | * | ||
4 | * The white paper on CRC32C calculations with PCLMULQDQ instruction can be | ||
5 | * downloaded from: | ||
6 | * http://download.intel.com/design/intarch/papers/323405.pdf | ||
7 | * | ||
8 | * Copyright (C) 2012 Intel Corporation. | ||
9 | * | ||
10 | * Authors: | ||
11 | * Wajdi Feghali <wajdi.k.feghali@intel.com> | ||
12 | * James Guilford <james.guilford@intel.com> | ||
13 | * David Cote <david.m.cote@intel.com> | ||
14 | * Tim Chen <tim.c.chen@linux.intel.com> | ||
15 | * | ||
16 | * This software is available to you under a choice of one of two | ||
17 | * licenses. You may choose to be licensed under the terms of the GNU | ||
18 | * General Public License (GPL) Version 2, available from the file | ||
19 | * COPYING in the main directory of this source tree, or the | ||
20 | * OpenIB.org BSD license below: | ||
21 | * | ||
22 | * Redistribution and use in source and binary forms, with or | ||
23 | * without modification, are permitted provided that the following | ||
24 | * conditions are met: | ||
25 | * | ||
26 | * - Redistributions of source code must retain the above | ||
27 | * copyright notice, this list of conditions and the following | ||
28 | * disclaimer. | ||
29 | * | ||
30 | * - Redistributions in binary form must reproduce the above | ||
31 | * copyright notice, this list of conditions and the following | ||
32 | * disclaimer in the documentation and/or other materials | ||
33 | * provided with the distribution. | ||
34 | * | ||
35 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
36 | * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
37 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
38 | * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
39 | * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
40 | * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
41 | * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
42 | * SOFTWARE. | ||
43 | */ | ||
44 | |||
45 | ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction | ||
46 | |||
47 | .macro LABEL prefix n | ||
48 | \prefix\n\(): | ||
49 | .endm | ||
50 | |||
51 | .macro JMPTBL_ENTRY i | ||
52 | .word crc_\i - crc_array | ||
53 | .endm | ||
54 | |||
55 | .macro JNC_LESS_THAN j | ||
56 | jnc less_than_\j | ||
57 | .endm | ||
58 | |||
59 | # Define threshold where buffers are considered "small" and routed to more | ||
60 | # efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so | ||
61 | # SMALL_SIZE can be no larger than 255. | ||
62 | |||
63 | #define SMALL_SIZE 200 | ||
64 | |||
65 | .if (SMALL_SIZE > 255) | ||
66 | .error "SMALL_SIZE must be < 256" | ||
67 | .endif | ||
68 | |||
69 | # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); | ||
70 | |||
71 | .global crc_pcl | ||
72 | crc_pcl: | ||
73 | #define bufp %rdi | ||
74 | #define bufp_dw %edi | ||
75 | #define bufp_w %di | ||
76 | #define bufp_b %dil | ||
77 | #define bufptmp %rcx | ||
78 | #define block_0 %rcx | ||
79 | #define block_1 %rdx | ||
80 | #define block_2 %r11 | ||
81 | #define len %rsi | ||
82 | #define len_dw %esi | ||
83 | #define len_w %si | ||
84 | #define len_b %sil | ||
85 | #define crc_init_arg %rdx | ||
86 | #define tmp %rbx | ||
87 | #define crc_init %r8 | ||
88 | #define crc_init_dw %r8d | ||
89 | #define crc1 %r9 | ||
90 | #define crc2 %r10 | ||
91 | |||
92 | pushq %rbx | ||
93 | pushq %rdi | ||
94 | pushq %rsi | ||
95 | |||
96 | ## Move crc_init for Linux to a different register | ||
97 | mov crc_init_arg, crc_init | ||
98 | |||
99 | ################################################################ | ||
100 | ## 1) ALIGN: | ||
101 | ################################################################ | ||
102 | |||
103 | mov bufp, bufptmp # rdi = *buf | ||
104 | neg bufp | ||
105 | and $7, bufp # calculate the unalignment amount of | ||
106 | # the address | ||
107 | je proc_block # Skip if aligned | ||
108 | |||
109 | ## If len is less than 8 and we're unaligned, we need to jump | ||
110 | ## to special code to avoid reading beyond the end of the buffer | ||
111 | cmp $8, len | ||
112 | jae do_align | ||
113 | # less_than_8 expects length in upper 3 bits of len_dw | ||
114 | # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] | ||
115 | shl $32-3+1, len_dw | ||
116 | jmp less_than_8_post_shl1 | ||
117 | |||
118 | do_align: | ||
119 | #### Calculate CRC of unaligned bytes of the buffer (if any) | ||
120 | movq (bufptmp), tmp # load a quadword from the buffer | ||
121 | add bufp, bufptmp # align buffer pointer for quadword | ||
122 | # processing | ||
123 | sub bufp, len # update buffer length | ||
124 | align_loop: | ||
125 | crc32b %bl, crc_init_dw # compute crc32 of 1 byte | ||
126 | shr $8, tmp # get next byte | ||
127 | dec bufp | ||
128 | jne align_loop | ||
129 | |||
130 | proc_block: | ||
131 | |||
132 | ################################################################ | ||
133 | ## 2) PROCESS BLOCKS: | ||
134 | ################################################################ | ||
135 | |||
136 | ## compute num of bytes to be processed | ||
137 | movq len, tmp # save num bytes in tmp | ||
138 | |||
139 | cmpq $128*24, len | ||
140 | jae full_block | ||
141 | |||
142 | continue_block: | ||
143 | cmpq $SMALL_SIZE, len | ||
144 | jb small | ||
145 | |||
146 | ## len < 128*24 | ||
147 | movq $2731, %rax # 2731 = ceil(2^16 / 24) | ||
148 | mul len_dw | ||
149 | shrq $16, %rax | ||
150 | |||
151 | ## eax contains floor(bytes / 24) = num 24-byte chunks to do | ||
152 | |||
153 | ## process rax 24-byte chunks (128 >= rax >= 0) | ||
154 | |||
155 | ## compute end address of each block | ||
156 | ## block 0 (base addr + RAX * 8) | ||
157 | ## block 1 (base addr + RAX * 16) | ||
158 | ## block 2 (base addr + RAX * 24) | ||
159 | lea (bufptmp, %rax, 8), block_0 | ||
160 | lea (block_0, %rax, 8), block_1 | ||
161 | lea (block_1, %rax, 8), block_2 | ||
162 | |||
163 | xor crc1, crc1 | ||
164 | xor crc2, crc2 | ||
165 | |||
166 | ## branch into array | ||
167 | lea jump_table(%rip), bufp | ||
168 | movzxw (bufp, %rax, 2), len | ||
169 | offset=crc_array-jump_table | ||
170 | lea offset(bufp, len, 1), bufp | ||
171 | jmp *bufp | ||
172 | |||
173 | ################################################################ | ||
174 | ## 2a) PROCESS FULL BLOCKS: | ||
175 | ################################################################ | ||
176 | full_block: | ||
177 | movq $128,%rax | ||
178 | lea 128*8*2(block_0), block_1 | ||
179 | lea 128*8*3(block_0), block_2 | ||
180 | add $128*8*1, block_0 | ||
181 | |||
182 | xor crc1,crc1 | ||
183 | xor crc2,crc2 | ||
184 | |||
185 | # Fall through into top of crc array (crc_128) | ||
186 | |||
187 | ################################################################ | ||
188 | ## 3) CRC Array: | ||
189 | ################################################################ | ||
190 | |||
191 | crc_array: | ||
192 | i=128 | ||
193 | .rept 128-1 | ||
194 | .altmacro | ||
195 | LABEL crc_ %i | ||
196 | .noaltmacro | ||
197 | crc32q -i*8(block_0), crc_init | ||
198 | crc32q -i*8(block_1), crc1 | ||
199 | crc32q -i*8(block_2), crc2 | ||
200 | i=(i-1) | ||
201 | .endr | ||
202 | |||
203 | .altmacro | ||
204 | LABEL crc_ %i | ||
205 | .noaltmacro | ||
206 | crc32q -i*8(block_0), crc_init | ||
207 | crc32q -i*8(block_1), crc1 | ||
208 | # SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet | ||
209 | |||
210 | mov block_2, block_0 | ||
211 | |||
212 | ################################################################ | ||
213 | ## 4) Combine three results: | ||
214 | ################################################################ | ||
215 | |||
216 | lea (K_table-16)(%rip), bufp # first entry is for idx 1 | ||
217 | shlq $3, %rax # rax *= 8 | ||
218 | subq %rax, tmp # tmp -= rax*8 | ||
219 | shlq $1, %rax | ||
220 | subq %rax, tmp # tmp -= rax*16 | ||
221 | # (total tmp -= rax*24) | ||
222 | addq %rax, bufp | ||
223 | |||
224 | movdqa (bufp), %xmm0 # 2 consts: K1:K2 | ||
225 | |||
226 | movq crc_init, %xmm1 # CRC for block 1 | ||
227 | pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 | ||
228 | |||
229 | movq crc1, %xmm2 # CRC for block 2 | ||
230 | pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 | ||
231 | |||
232 | pxor %xmm2,%xmm1 | ||
233 | movq %xmm1, %rax | ||
234 | xor -i*8(block_2), %rax | ||
235 | mov crc2, crc_init | ||
236 | crc32 %rax, crc_init | ||
237 | |||
238 | ################################################################ | ||
239 | ## 5) Check for end: | ||
240 | ################################################################ | ||
241 | |||
242 | LABEL crc_ 0 | ||
243 | mov tmp, len | ||
244 | cmp $128*24, tmp | ||
245 | jae full_block | ||
246 | cmp $24, tmp | ||
247 | jae continue_block | ||
248 | |||
249 | less_than_24: | ||
250 | shl $32-4, len_dw # less_than_16 expects length | ||
251 | # in upper 4 bits of len_dw | ||
252 | jnc less_than_16 | ||
253 | crc32q (bufptmp), crc_init | ||
254 | crc32q 8(bufptmp), crc_init | ||
255 | jz do_return | ||
256 | add $16, bufptmp | ||
257 | # len is less than 8 if we got here | ||
258 | # less_than_8 expects length in upper 3 bits of len_dw | ||
259 | # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] | ||
260 | shl $2, len_dw | ||
261 | jmp less_than_8_post_shl1 | ||
262 | |||
263 | ####################################################################### | ||
264 | ## 6) LESS THAN 256 bytes REMAIN AT THIS POINT (8 bits of len are full) | ||
265 | ####################################################################### | ||
266 | small: | ||
267 | shl $32-8, len_dw # Prepare len_dw for less_than_256 | ||
268 | j=256 | ||
269 | .rept 5 # j = {256, 128, 64, 32, 16} | ||
270 | .altmacro | ||
271 | LABEL less_than_ %j # less_than_j: Length should be in | ||
272 | # upper lg(j) bits of len_dw | ||
273 | j=(j/2) | ||
274 | shl $1, len_dw # Get next MSB | ||
275 | JNC_LESS_THAN %j | ||
276 | .noaltmacro | ||
277 | i=0 | ||
278 | .rept (j/8) | ||
279 | crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data | ||
280 | i=i+8 | ||
281 | .endr | ||
282 | jz do_return # Return if remaining length is zero | ||
283 | add $j, bufptmp # Advance buf | ||
284 | .endr | ||
285 | |||
286 | less_than_8: # Length should be stored in | ||
287 | # upper 3 bits of len_dw | ||
288 | shl $1, len_dw | ||
289 | less_than_8_post_shl1: | ||
290 | jnc less_than_4 | ||
291 | crc32l (bufptmp), crc_init_dw # CRC of 4 bytes | ||
292 | jz do_return # return if remaining data is zero | ||
293 | add $4, bufptmp | ||
294 | less_than_4: # Length should be stored in | ||
295 | # upper 2 bits of len_dw | ||
296 | shl $1, len_dw | ||
297 | jnc less_than_2 | ||
298 | crc32w (bufptmp), crc_init_dw # CRC of 2 bytes | ||
299 | jz do_return # return if remaining data is zero | ||
300 | add $2, bufptmp | ||
301 | less_than_2: # Length should be stored in the MSB | ||
302 | # of len_dw | ||
303 | shl $1, len_dw | ||
304 | jnc less_than_1 | ||
305 | crc32b (bufptmp), crc_init_dw # CRC of 1 byte | ||
306 | less_than_1: # Length should be zero | ||
307 | do_return: | ||
308 | movq crc_init, %rax | ||
309 | popq %rsi | ||
310 | popq %rdi | ||
311 | popq %rbx | ||
312 | ret | ||
313 | |||
314 | ################################################################ | ||
315 | ## jump table: 129 entries x 2 bytes each | ||
316 | ################################################################ | ||
317 | .align 4 | ||
318 | jump_table: | ||
319 | i=0 | ||
320 | .rept 129 | ||
321 | .altmacro | ||
322 | JMPTBL_ENTRY %i | ||
323 | .noaltmacro | ||
324 | i=i+1 | ||
325 | .endr | ||
326 | ################################################################ | ||
327 | ## PCLMULQDQ tables | ||
328 | ## Table is 128 entries x 2 quad words each | ||
329 | ################################################################ | ||
330 | .data | ||
331 | .align 64 | ||
332 | K_table: | ||
333 | .quad 0x14cd00bd6,0x105ec76f0 | ||
334 | .quad 0x0ba4fc28e,0x14cd00bd6 | ||
335 | .quad 0x1d82c63da,0x0f20c0dfe | ||
336 | .quad 0x09e4addf8,0x0ba4fc28e | ||
337 | .quad 0x039d3b296,0x1384aa63a | ||
338 | .quad 0x102f9b8a2,0x1d82c63da | ||
339 | .quad 0x14237f5e6,0x01c291d04 | ||
340 | .quad 0x00d3b6092,0x09e4addf8 | ||
341 | .quad 0x0c96cfdc0,0x0740eef02 | ||
342 | .quad 0x18266e456,0x039d3b296 | ||
343 | .quad 0x0daece73e,0x0083a6eec | ||
344 | .quad 0x0ab7aff2a,0x102f9b8a2 | ||
345 | .quad 0x1248ea574,0x1c1733996 | ||
346 | .quad 0x083348832,0x14237f5e6 | ||
347 | .quad 0x12c743124,0x02ad91c30 | ||
348 | .quad 0x0b9e02b86,0x00d3b6092 | ||
349 | .quad 0x018b33a4e,0x06992cea2 | ||
350 | .quad 0x1b331e26a,0x0c96cfdc0 | ||
351 | .quad 0x17d35ba46,0x07e908048 | ||
352 | .quad 0x1bf2e8b8a,0x18266e456 | ||
353 | .quad 0x1a3e0968a,0x11ed1f9d8 | ||
354 | .quad 0x0ce7f39f4,0x0daece73e | ||
355 | .quad 0x061d82e56,0x0f1d0f55e | ||
356 | .quad 0x0d270f1a2,0x0ab7aff2a | ||
357 | .quad 0x1c3f5f66c,0x0a87ab8a8 | ||
358 | .quad 0x12ed0daac,0x1248ea574 | ||
359 | .quad 0x065863b64,0x08462d800 | ||
360 | .quad 0x11eef4f8e,0x083348832 | ||
361 | .quad 0x1ee54f54c,0x071d111a8 | ||
362 | .quad 0x0b3e32c28,0x12c743124 | ||
363 | .quad 0x0064f7f26,0x0ffd852c6 | ||
364 | .quad 0x0dd7e3b0c,0x0b9e02b86 | ||
365 | .quad 0x0f285651c,0x0dcb17aa4 | ||
366 | .quad 0x010746f3c,0x018b33a4e | ||
367 | .quad 0x1c24afea4,0x0f37c5aee | ||
368 | .quad 0x0271d9844,0x1b331e26a | ||
369 | .quad 0x08e766a0c,0x06051d5a2 | ||
370 | .quad 0x093a5f730,0x17d35ba46 | ||
371 | .quad 0x06cb08e5c,0x11d5ca20e | ||
372 | .quad 0x06b749fb2,0x1bf2e8b8a | ||
373 | .quad 0x1167f94f2,0x021f3d99c | ||
374 | .quad 0x0cec3662e,0x1a3e0968a | ||
375 | .quad 0x19329634a,0x08f158014 | ||
376 | .quad 0x0e6fc4e6a,0x0ce7f39f4 | ||
377 | .quad 0x08227bb8a,0x1a5e82106 | ||
378 | .quad 0x0b0cd4768,0x061d82e56 | ||
379 | .quad 0x13c2b89c4,0x188815ab2 | ||
380 | .quad 0x0d7a4825c,0x0d270f1a2 | ||
381 | .quad 0x10f5ff2ba,0x105405f3e | ||
382 | .quad 0x00167d312,0x1c3f5f66c | ||
383 | .quad 0x0f6076544,0x0e9adf796 | ||
384 | .quad 0x026f6a60a,0x12ed0daac | ||
385 | .quad 0x1a2adb74e,0x096638b34 | ||
386 | .quad 0x19d34af3a,0x065863b64 | ||
387 | .quad 0x049c3cc9c,0x1e50585a0 | ||
388 | .quad 0x068bce87a,0x11eef4f8e | ||
389 | .quad 0x1524fa6c6,0x19f1c69dc | ||
390 | .quad 0x16cba8aca,0x1ee54f54c | ||
391 | .quad 0x042d98888,0x12913343e | ||
392 | .quad 0x1329d9f7e,0x0b3e32c28 | ||
393 | .quad 0x1b1c69528,0x088f25a3a | ||
394 | .quad 0x02178513a,0x0064f7f26 | ||
395 | .quad 0x0e0ac139e,0x04e36f0b0 | ||
396 | .quad 0x0170076fa,0x0dd7e3b0c | ||
397 | .quad 0x141a1a2e2,0x0bd6f81f8 | ||
398 | .quad 0x16ad828b4,0x0f285651c | ||
399 | .quad 0x041d17b64,0x19425cbba | ||
400 | .quad 0x1fae1cc66,0x010746f3c | ||
401 | .quad 0x1a75b4b00,0x18db37e8a | ||
402 | .quad 0x0f872e54c,0x1c24afea4 | ||
403 | .quad 0x01e41e9fc,0x04c144932 | ||
404 | .quad 0x086d8e4d2,0x0271d9844 | ||
405 | .quad 0x160f7af7a,0x052148f02 | ||
406 | .quad 0x05bb8f1bc,0x08e766a0c | ||
407 | .quad 0x0a90fd27a,0x0a3c6f37a | ||
408 | .quad 0x0b3af077a,0x093a5f730 | ||
409 | .quad 0x04984d782,0x1d22c238e | ||
410 | .quad 0x0ca6ef3ac,0x06cb08e5c | ||
411 | .quad 0x0234e0b26,0x063ded06a | ||
412 | .quad 0x1d88abd4a,0x06b749fb2 | ||
413 | .quad 0x04597456a,0x04d56973c | ||
414 | .quad 0x0e9e28eb4,0x1167f94f2 | ||
415 | .quad 0x07b3ff57a,0x19385bf2e | ||
416 | .quad 0x0c9c8b782,0x0cec3662e | ||
417 | .quad 0x13a9cba9e,0x0e417f38a | ||
418 | .quad 0x093e106a4,0x19329634a | ||
419 | .quad 0x167001a9c,0x14e727980 | ||
420 | .quad 0x1ddffc5d4,0x0e6fc4e6a | ||
421 | .quad 0x00df04680,0x0d104b8fc | ||
422 | .quad 0x02342001e,0x08227bb8a | ||
423 | .quad 0x00a2a8d7e,0x05b397730 | ||
424 | .quad 0x168763fa6,0x0b0cd4768 | ||
425 | .quad 0x1ed5a407a,0x0e78eb416 | ||
426 | .quad 0x0d2c3ed1a,0x13c2b89c4 | ||
427 | .quad 0x0995a5724,0x1641378f0 | ||
428 | .quad 0x19b1afbc4,0x0d7a4825c | ||
429 | .quad 0x109ffedc0,0x08d96551c | ||
430 | .quad 0x0f2271e60,0x10f5ff2ba | ||
431 | .quad 0x00b0bf8ca,0x00bf80dd2 | ||
432 | .quad 0x123888b7a,0x00167d312 | ||
433 | .quad 0x1e888f7dc,0x18dcddd1c | ||
434 | .quad 0x002ee03b2,0x0f6076544 | ||
435 | .quad 0x183e8d8fe,0x06a45d2b2 | ||
436 | .quad 0x133d7a042,0x026f6a60a | ||
437 | .quad 0x116b0f50c,0x1dd3e10e8 | ||
438 | .quad 0x05fabe670,0x1a2adb74e | ||
439 | .quad 0x130004488,0x0de87806c | ||
440 | .quad 0x000bcf5f6,0x19d34af3a | ||
441 | .quad 0x18f0c7078,0x014338754 | ||
442 | .quad 0x017f27698,0x049c3cc9c | ||
443 | .quad 0x058ca5f00,0x15e3e77ee | ||
444 | .quad 0x1af900c24,0x068bce87a | ||
445 | .quad 0x0b5cfca28,0x0dd07448e | ||
446 | .quad 0x0ded288f8,0x1524fa6c6 | ||
447 | .quad 0x059f229bc,0x1d8048348 | ||
448 | .quad 0x06d390dec,0x16cba8aca | ||
449 | .quad 0x037170390,0x0a3e3e02c | ||
450 | .quad 0x06353c1cc,0x042d98888 | ||
451 | .quad 0x0c4584f5c,0x0d73c7bea | ||
452 | .quad 0x1f16a3418,0x1329d9f7e | ||
453 | .quad 0x0531377e2,0x185137662 | ||
454 | .quad 0x1d8d9ca7c,0x1b1c69528 | ||
455 | .quad 0x0b25b29f2,0x18a08b5bc | ||
456 | .quad 0x19fb2a8b0,0x02178513a | ||
457 | .quad 0x1a08fe6ac,0x1da758ae0 | ||
458 | .quad 0x045cddf4e,0x0e0ac139e | ||
459 | .quad 0x1a91647f2,0x169cf9eb0 | ||
460 | .quad 0x1a0f717c4,0x0170076fa | ||
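
The crc_array loop in the file above leans on instruction-level parallelism: crc32q has multi-cycle latency but roughly single-cycle throughput, so each iteration feeds three independent streams (block_0/1/2), and the three partial CRCs are folded together afterwards using the PCLMULQDQ constants in K_table. The interleaving alone can be illustrated in userspace by checksumming three separate buffers in one loop; the fold step, which section 4 and K_table implement, is deliberately omitted here:

#include <stdint.h>
#include <stddef.h>
#include <nmmintrin.h>			/* _mm_crc32_u64, build with -msse4.2 */

/* three independent CRC streams per iteration keep the crc32 unit busy */
static void crc32c_3way(const uint64_t *b0, const uint64_t *b1,
			const uint64_t *b2, size_t nqwords, uint32_t crc[3])
{
	uint64_t c0 = crc[0], c1 = crc[1], c2 = crc[2];
	size_t i;

	for (i = 0; i < nqwords; i++) {
		c0 = _mm_crc32_u64(c0, b0[i]);	/* stream 0 */
		c1 = _mm_crc32_u64(c1, b1[i]);	/* stream 1: independent of c0 */
		c2 = _mm_crc32_u64(c2, b2[i]);	/* stream 2: independent of c0/c1 */
	}
	crc[0] = (uint32_t)c0;
	crc[1] = (uint32_t)c1;
	crc[2] = (uint32_t)c2;
}
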
diff --git a/arch/x86/crypto/glue_helper-asm-avx.S b/arch/x86/crypto/glue_helper-asm-avx.S new file mode 100644 index 000000000000..f7b6ea2ddfdb --- /dev/null +++ b/arch/x86/crypto/glue_helper-asm-avx.S | |||
@@ -0,0 +1,91 @@ | |||
1 | /* | ||
2 | * Shared glue code for 128bit block ciphers, AVX assembler macros | ||
3 | * | ||
4 | * Copyright (c) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | */ | ||
17 | |||
18 | #define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
19 | vmovdqu (0*16)(src), x0; \ | ||
20 | vmovdqu (1*16)(src), x1; \ | ||
21 | vmovdqu (2*16)(src), x2; \ | ||
22 | vmovdqu (3*16)(src), x3; \ | ||
23 | vmovdqu (4*16)(src), x4; \ | ||
24 | vmovdqu (5*16)(src), x5; \ | ||
25 | vmovdqu (6*16)(src), x6; \ | ||
26 | vmovdqu (7*16)(src), x7; | ||
27 | |||
28 | #define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
29 | vmovdqu x0, (0*16)(dst); \ | ||
30 | vmovdqu x1, (1*16)(dst); \ | ||
31 | vmovdqu x2, (2*16)(dst); \ | ||
32 | vmovdqu x3, (3*16)(dst); \ | ||
33 | vmovdqu x4, (4*16)(dst); \ | ||
34 | vmovdqu x5, (5*16)(dst); \ | ||
35 | vmovdqu x6, (6*16)(dst); \ | ||
36 | vmovdqu x7, (7*16)(dst); | ||
37 | |||
38 | #define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
39 | vpxor (0*16)(src), x1, x1; \ | ||
40 | vpxor (1*16)(src), x2, x2; \ | ||
41 | vpxor (2*16)(src), x3, x3; \ | ||
42 | vpxor (3*16)(src), x4, x4; \ | ||
43 | vpxor (4*16)(src), x5, x5; \ | ||
44 | vpxor (5*16)(src), x6, x6; \ | ||
45 | vpxor (6*16)(src), x7, x7; \ | ||
46 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
47 | |||
48 | #define inc_le128(x, minus_one, tmp) \ | ||
49 | vpcmpeqq minus_one, x, tmp; \ | ||
50 | vpsubq minus_one, x, x; \ | ||
51 | vpslldq $8, tmp, tmp; \ | ||
52 | vpsubq tmp, x, x; | ||
53 | |||
54 | #define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \ | ||
55 | vpcmpeqd t0, t0, t0; \ | ||
56 | vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \ | ||
57 | vmovdqa bswap, t1; \ | ||
58 | \ | ||
59 | /* load IV and byteswap */ \ | ||
60 | vmovdqu (iv), x7; \ | ||
61 | vpshufb t1, x7, x0; \ | ||
62 | \ | ||
63 | /* construct IVs */ \ | ||
64 | inc_le128(x7, t0, t2); \ | ||
65 | vpshufb t1, x7, x1; \ | ||
66 | inc_le128(x7, t0, t2); \ | ||
67 | vpshufb t1, x7, x2; \ | ||
68 | inc_le128(x7, t0, t2); \ | ||
69 | vpshufb t1, x7, x3; \ | ||
70 | inc_le128(x7, t0, t2); \ | ||
71 | vpshufb t1, x7, x4; \ | ||
72 | inc_le128(x7, t0, t2); \ | ||
73 | vpshufb t1, x7, x5; \ | ||
74 | inc_le128(x7, t0, t2); \ | ||
75 | vpshufb t1, x7, x6; \ | ||
76 | inc_le128(x7, t0, t2); \ | ||
77 | vmovdqa x7, t2; \ | ||
78 | vpshufb t1, x7, x7; \ | ||
79 | inc_le128(t2, t0, t1); \ | ||
80 | vmovdqu t2, (iv); | ||
81 | |||
82 | #define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ | ||
83 | vpxor (0*16)(src), x0, x0; \ | ||
84 | vpxor (1*16)(src), x1, x1; \ | ||
85 | vpxor (2*16)(src), x2, x2; \ | ||
86 | vpxor (3*16)(src), x3, x3; \ | ||
87 | vpxor (4*16)(src), x4, x4; \ | ||
88 | vpxor (5*16)(src), x5, x5; \ | ||
89 | vpxor (6*16)(src), x6, x6; \ | ||
90 | vpxor (7*16)(src), x7, x7; \ | ||
91 | store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); | ||
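
The inc_le128 macro above is a full 128-bit increment inside an XMM register: minus_one is built as {low: -1, high: 0}, so the first vpsubq adds 1 to the low qword only, while the vpcmpeqq/vpslldq pair turns "low qword was all-ones" into a carry mask that bumps the high qword exactly when the low qword wraps. An equivalent intrinsics sketch, with SSE4.1's _mm_cmpeq_epi64 standing in for vpcmpeqq:

#include <emmintrin.h>
#include <smmintrin.h>			/* _mm_cmpeq_epi64 (SSE4.1) */

static __m128i inc_le128_sketch(__m128i x)
{
	__m128i ones = _mm_cmpeq_epi32(x, x);		/* all-ones */
	__m128i minus_one = _mm_srli_si128(ones, 8);	/* low: -1, high: 0 */
	__m128i carry = _mm_cmpeq_epi64(x, minus_one);	/* low lane: wrap mask */

	x = _mm_sub_epi64(x, minus_one);	/* low qword += 1 */
	carry = _mm_slli_si128(carry, 8);	/* move wrap mask into high lane */
	return _mm_sub_epi64(x, carry);		/* high qword += 1 on wrap */
}

The spurious high-lane result of the compare is harmless: the byte shift discards it and leaves only the low lane's wrap mask in the high position.
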
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index 30b3927bd733..22ce4f683e55 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c | |||
@@ -221,16 +221,16 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, | |||
221 | u8 *src = (u8 *)walk->src.virt.addr; | 221 | u8 *src = (u8 *)walk->src.virt.addr; |
222 | u8 *dst = (u8 *)walk->dst.virt.addr; | 222 | u8 *dst = (u8 *)walk->dst.virt.addr; |
223 | unsigned int nbytes = walk->nbytes; | 223 | unsigned int nbytes = walk->nbytes; |
224 | u128 ctrblk; | 224 | le128 ctrblk; |
225 | u128 tmp; | 225 | u128 tmp; |
226 | 226 | ||
227 | be128_to_u128(&ctrblk, (be128 *)walk->iv); | 227 | be128_to_le128(&ctrblk, (be128 *)walk->iv); |
228 | 228 | ||
229 | memcpy(&tmp, src, nbytes); | 229 | memcpy(&tmp, src, nbytes); |
230 | fn_ctr(ctx, &tmp, &tmp, &ctrblk); | 230 | fn_ctr(ctx, &tmp, &tmp, &ctrblk); |
231 | memcpy(dst, &tmp, nbytes); | 231 | memcpy(dst, &tmp, nbytes); |
232 | 232 | ||
233 | u128_to_be128((be128 *)walk->iv, &ctrblk); | 233 | le128_to_be128((be128 *)walk->iv, &ctrblk); |
234 | } | 234 | } |
235 | EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); | 235 | EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); |
236 | 236 | ||
@@ -243,11 +243,11 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |||
243 | unsigned int nbytes = walk->nbytes; | 243 | unsigned int nbytes = walk->nbytes; |
244 | u128 *src = (u128 *)walk->src.virt.addr; | 244 | u128 *src = (u128 *)walk->src.virt.addr; |
245 | u128 *dst = (u128 *)walk->dst.virt.addr; | 245 | u128 *dst = (u128 *)walk->dst.virt.addr; |
246 | u128 ctrblk; | 246 | le128 ctrblk; |
247 | unsigned int num_blocks, func_bytes; | 247 | unsigned int num_blocks, func_bytes; |
248 | unsigned int i; | 248 | unsigned int i; |
249 | 249 | ||
250 | be128_to_u128(&ctrblk, (be128 *)walk->iv); | 250 | be128_to_le128(&ctrblk, (be128 *)walk->iv); |
251 | 251 | ||
252 | /* Process multi-block batch */ | 252 | /* Process multi-block batch */ |
253 | for (i = 0; i < gctx->num_funcs; i++) { | 253 | for (i = 0; i < gctx->num_funcs; i++) { |
@@ -269,7 +269,7 @@ static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |||
269 | } | 269 | } |
270 | 270 | ||
271 | done: | 271 | done: |
272 | u128_to_be128((be128 *)walk->iv, &ctrblk); | 272 | le128_to_be128((be128 *)walk->iv, &ctrblk); |
273 | return nbytes; | 273 | return nbytes; |
274 | } | 274 | } |
275 | 275 | ||
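
glue_ctr_crypt_final_128bit() above covers the sub-block tail of a CTR walk: the leftover bytes are staged through a full 16-byte temporary so the single-block CTR function runs unchanged, and only nbytes of the result are copied out. In the terms of the earlier cast6 sketch (reusing its le128 type and ctr_crypt_one() helper, which are illustrative, not kernel API):

#include <string.h>

static void ctr_final_partial(void *ctx, uint8_t *dst, const uint8_t *src,
			      size_t nbytes /* < 16 */, le128 *iv,
			      void (*encrypt)(void *, uint8_t *, const uint8_t *))
{
	uint8_t tmp[16] = { 0 };

	memcpy(tmp, src, nbytes);		/* stage the tail in a full block */
	ctr_crypt_one(ctx, tmp, tmp, iv, encrypt);
	memcpy(dst, tmp, nbytes);		/* emit only the tail bytes */
}
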
diff --git a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S index 504106bf04a2..02b0e9fe997c 100644 --- a/arch/x86/crypto/serpent-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/serpent-avx-x86_64-asm_64.S | |||
@@ -24,7 +24,16 @@ | |||
24 | * | 24 | * |
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include "glue_helper-asm-avx.S" | ||
28 | |||
27 | .file "serpent-avx-x86_64-asm_64.S" | 29 | .file "serpent-avx-x86_64-asm_64.S" |
30 | |||
31 | .data | ||
32 | .align 16 | ||
33 | |||
34 | .Lbswap128_mask: | ||
35 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
36 | |||
28 | .text | 37 | .text |
29 | 38 | ||
30 | #define CTX %rdi | 39 | #define CTX %rdi |
@@ -550,51 +559,27 @@ | |||
550 | vpunpcklqdq x3, t2, x2; \ | 559 | vpunpcklqdq x3, t2, x2; \ |
551 | vpunpckhqdq x3, t2, x3; | 560 | vpunpckhqdq x3, t2, x3; |
552 | 561 | ||
553 | #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ | 562 | #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
554 | vmovdqu (0*4*4)(in), x0; \ | ||
555 | vmovdqu (1*4*4)(in), x1; \ | ||
556 | vmovdqu (2*4*4)(in), x2; \ | ||
557 | vmovdqu (3*4*4)(in), x3; \ | ||
558 | \ | ||
559 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | 563 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
560 | 564 | ||
561 | #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | 565 | #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \ |
562 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | 566 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
563 | \ | ||
564 | vmovdqu x0, (0*4*4)(out); \ | ||
565 | vmovdqu x1, (1*4*4)(out); \ | ||
566 | vmovdqu x2, (2*4*4)(out); \ | ||
567 | vmovdqu x3, (3*4*4)(out); | ||
568 | |||
569 | #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ | ||
570 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
571 | \ | ||
572 | vpxor (0*4*4)(out), x0, x0; \ | ||
573 | vmovdqu x0, (0*4*4)(out); \ | ||
574 | vpxor (1*4*4)(out), x1, x1; \ | ||
575 | vmovdqu x1, (1*4*4)(out); \ | ||
576 | vpxor (2*4*4)(out), x2, x2; \ | ||
577 | vmovdqu x2, (2*4*4)(out); \ | ||
578 | vpxor (3*4*4)(out), x3, x3; \ | ||
579 | vmovdqu x3, (3*4*4)(out); | ||
580 | 567 | ||
581 | .align 8 | 568 | .align 8 |
582 | .global __serpent_enc_blk_8way_avx | 569 | .type __serpent_enc_blk8_avx,@function; |
583 | .type __serpent_enc_blk_8way_avx,@function; | ||
584 | 570 | ||
585 | __serpent_enc_blk_8way_avx: | 571 | __serpent_enc_blk8_avx: |
586 | /* input: | 572 | /* input: |
587 | * %rdi: ctx, CTX | 573 | * %rdi: ctx, CTX |
588 | * %rsi: dst | 574 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks |
589 | * %rdx: src | 575 | * output: |
590 | * %rcx: bool, if true: xor output | 576 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
591 | */ | 577 | */ |
592 | 578 | ||
593 | vpcmpeqd RNOT, RNOT, RNOT; | 579 | vpcmpeqd RNOT, RNOT, RNOT; |
594 | 580 | ||
595 | leaq (4*4*4)(%rdx), %rax; | 581 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
596 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 582 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
597 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
598 | 583 | ||
599 | K2(RA, RB, RC, RD, RE, 0); | 584 | K2(RA, RB, RC, RD, RE, 0); |
600 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); | 585 | S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1); |
@@ -630,38 +615,26 @@ __serpent_enc_blk_8way_avx: | |||
630 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); | 615 | S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31); |
631 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); | 616 | S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32); |
632 | 617 | ||
633 | leaq (4*4*4)(%rsi), %rax; | 618 | write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
634 | 619 | write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); | |
635 | testb %cl, %cl; | ||
636 | jnz __enc_xor8; | ||
637 | |||
638 | write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
639 | write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
640 | |||
641 | ret; | ||
642 | |||
643 | __enc_xor8: | ||
644 | xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | ||
645 | xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
646 | 620 | ||
647 | ret; | 621 | ret; |
648 | 622 | ||
649 | .align 8 | 623 | .align 8 |
650 | .global serpent_dec_blk_8way_avx | 624 | .type __serpent_dec_blk8_avx,@function; |
651 | .type serpent_dec_blk_8way_avx,@function; | ||
652 | 625 | ||
653 | serpent_dec_blk_8way_avx: | 626 | __serpent_dec_blk8_avx: |
654 | /* input: | 627 | /* input: |
655 | * %rdi: ctx, CTX | 628 | * %rdi: ctx, CTX |
656 | * %rsi: dst | 629 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks |
657 | * %rdx: src | 630 | * output: |
631 | * RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks | ||
658 | */ | 632 | */ |
659 | 633 | ||
660 | vpcmpeqd RNOT, RNOT, RNOT; | 634 | vpcmpeqd RNOT, RNOT, RNOT; |
661 | 635 | ||
662 | leaq (4*4*4)(%rdx), %rax; | 636 | read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2); |
663 | read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2); | 637 | read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2); |
664 | read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2); | ||
665 | 638 | ||
666 | K2(RA, RB, RC, RD, RE, 32); | 639 | K2(RA, RB, RC, RD, RE, 32); |
667 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); | 640 | SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31); |
@@ -697,8 +670,85 @@ serpent_dec_blk_8way_avx: | |||
697 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); | 670 | SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1); |
698 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); | 671 | S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0); |
699 | 672 | ||
700 | leaq (4*4*4)(%rsi), %rax; | 673 | write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2); |
701 | write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2); | 674 | write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2); |
702 | write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2); | 675 | |
676 | ret; | ||
677 | |||
678 | .align 8 | ||
679 | .global serpent_ecb_enc_8way_avx | ||
680 | .type serpent_ecb_enc_8way_avx,@function; | ||
681 | |||
682 | serpent_ecb_enc_8way_avx: | ||
683 | /* input: | ||
684 | * %rdi: ctx, CTX | ||
685 | * %rsi: dst | ||
686 | * %rdx: src | ||
687 | */ | ||
688 | |||
689 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
690 | |||
691 | call __serpent_enc_blk8_avx; | ||
692 | |||
693 | store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
694 | |||
695 | ret; | ||
696 | |||
697 | .align 8 | ||
698 | .global serpent_ecb_dec_8way_avx | ||
699 | .type serpent_ecb_dec_8way_avx,@function; | ||
700 | |||
701 | serpent_ecb_dec_8way_avx: | ||
702 | /* input: | ||
703 | * %rdi: ctx, CTX | ||
704 | * %rsi: dst | ||
705 | * %rdx: src | ||
706 | */ | ||
707 | |||
708 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
709 | |||
710 | call __serpent_dec_blk8_avx; | ||
711 | |||
712 | store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
713 | |||
714 | ret; | ||
715 | |||
716 | .align 8 | ||
717 | .global serpent_cbc_dec_8way_avx | ||
718 | .type serpent_cbc_dec_8way_avx,@function; | ||
719 | |||
720 | serpent_cbc_dec_8way_avx: | ||
721 | /* input: | ||
722 | * %rdi: ctx, CTX | ||
723 | * %rsi: dst | ||
724 | * %rdx: src | ||
725 | */ | ||
726 | |||
727 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
728 | |||
729 | call __serpent_dec_blk8_avx; | ||
730 | |||
731 | store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); | ||
732 | |||
733 | ret; | ||
734 | |||
735 | .align 8 | ||
736 | .global serpent_ctr_8way_avx | ||
737 | .type serpent_ctr_8way_avx,@function; | ||
738 | |||
739 | serpent_ctr_8way_avx: | ||
740 | /* input: | ||
741 | * %rdi: ctx, CTX | ||
742 | * %rsi: dst | ||
743 | * %rdx: src | ||
744 | * %rcx: iv (little endian, 128bit) | ||
745 | */ | ||
746 | |||
747 | load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
748 | RD2, RK0, RK1, RK2); | ||
749 | |||
750 | call __serpent_enc_blk8_avx; | ||
751 | |||
752 | store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
703 | 753 | ||
704 | ret; | 754 | ret; |
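
The restructuring above (and the matching twofish change below) is the "avoid temporary stack buffers" item from the changelog: the eight blocks now stay in XMM registers across an internal __serpent_enc_blk8_avx/__serpent_dec_blk8_avx core, while thin ECB/CBC/CTR entry points do the loads, chaining XORs and stores directly. For CBC decryption this matters because the chaining XOR needs the previous ciphertext block; a plain C version must either forbid in-place operation or buffer the ciphertext first, which is what the *_decrypt_cbc_xway() helpers being deleted from the glue files did with their ivs[] arrays. A hedged sketch of the C-level equivalent, with a hypothetical one-block decrypt() callback:

#include <stdint.h>

/* 8-block CBC decrypt. Assumes dst != src: decrypting in place would
 * destroy the ciphertext blocks still needed for chaining. The assembler
 * version has no such constraint, since the decrypted blocks live in
 * registers and store_cbc_8way XORs against src before anything is stored. */
static void cbc_dec_8way_sketch(void *ctx, uint8_t *dst, const uint8_t *src,
				void (*decrypt)(void *, uint8_t *, const uint8_t *))
{
	int i, b;

	for (i = 0; i < 8; i++)
		decrypt(ctx, dst + 16 * i, src + 16 * i);

	for (i = 7; i >= 1; i--)	/* chain: P[i] ^= C[i-1] */
		for (b = 0; b < 16; b++)
			dst[16 * i + b] ^= src[16 * (i - 1) + b];

	/* block 0 is XORed with the IV by the common glue layer */
}
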
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c index 3f543a04cf1e..52abaaf28e7f 100644 --- a/arch/x86/crypto/serpent_avx_glue.c +++ b/arch/x86/crypto/serpent_avx_glue.c | |||
@@ -42,55 +42,24 @@ | |||
42 | #include <asm/crypto/ablk_helper.h> | 42 | #include <asm/crypto/ablk_helper.h> |
43 | #include <asm/crypto/glue_helper.h> | 43 | #include <asm/crypto/glue_helper.h> |
44 | 44 | ||
45 | static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) | 45 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
46 | { | ||
47 | u128 ivs[SERPENT_PARALLEL_BLOCKS - 1]; | ||
48 | unsigned int j; | ||
49 | |||
50 | for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) | ||
51 | ivs[j] = src[j]; | ||
52 | |||
53 | serpent_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); | ||
54 | |||
55 | for (j = 0; j < SERPENT_PARALLEL_BLOCKS - 1; j++) | ||
56 | u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); | ||
57 | } | ||
58 | |||
59 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) | ||
60 | { | 46 | { |
61 | be128 ctrblk; | 47 | be128 ctrblk; |
62 | 48 | ||
63 | u128_to_be128(&ctrblk, iv); | 49 | le128_to_be128(&ctrblk, iv); |
64 | u128_inc(iv); | 50 | le128_inc(iv); |
65 | 51 | ||
66 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); | 52 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); |
67 | u128_xor(dst, src, (u128 *)&ctrblk); | 53 | u128_xor(dst, src, (u128 *)&ctrblk); |
68 | } | 54 | } |
69 | 55 | ||
70 | static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, | ||
71 | u128 *iv) | ||
72 | { | ||
73 | be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; | ||
74 | unsigned int i; | ||
75 | |||
76 | for (i = 0; i < SERPENT_PARALLEL_BLOCKS; i++) { | ||
77 | if (dst != src) | ||
78 | dst[i] = src[i]; | ||
79 | |||
80 | u128_to_be128(&ctrblks[i], iv); | ||
81 | u128_inc(iv); | ||
82 | } | ||
83 | |||
84 | serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); | ||
85 | } | ||
86 | |||
87 | static const struct common_glue_ctx serpent_enc = { | 56 | static const struct common_glue_ctx serpent_enc = { |
88 | .num_funcs = 2, | 57 | .num_funcs = 2, |
89 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, | 58 | .fpu_blocks_limit = SERPENT_PARALLEL_BLOCKS, |
90 | 59 | ||
91 | .funcs = { { | 60 | .funcs = { { |
92 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 61 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
93 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_enc_blk_xway) } | 62 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_enc_8way_avx) } |
94 | }, { | 63 | }, { |
95 | .num_blocks = 1, | 64 | .num_blocks = 1, |
96 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } | 65 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_encrypt) } |
@@ -103,7 +72,7 @@ static const struct common_glue_ctx serpent_ctr = { | |||
103 | 72 | ||
104 | .funcs = { { | 73 | .funcs = { { |
105 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 74 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
106 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr_xway) } | 75 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_ctr_8way_avx) } |
107 | }, { | 76 | }, { |
108 | .num_blocks = 1, | 77 | .num_blocks = 1, |
109 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } | 78 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(serpent_crypt_ctr) } |
@@ -116,7 +85,7 @@ static const struct common_glue_ctx serpent_dec = { | |||
116 | 85 | ||
117 | .funcs = { { | 86 | .funcs = { { |
118 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 87 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
119 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_dec_blk_xway) } | 88 | .fn_u = { .ecb = GLUE_FUNC_CAST(serpent_ecb_dec_8way_avx) } |
120 | }, { | 89 | }, { |
121 | .num_blocks = 1, | 90 | .num_blocks = 1, |
122 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } | 91 | .fn_u = { .ecb = GLUE_FUNC_CAST(__serpent_decrypt) } |
@@ -129,7 +98,7 @@ static const struct common_glue_ctx serpent_dec_cbc = { | |||
129 | 98 | ||
130 | .funcs = { { | 99 | .funcs = { { |
131 | .num_blocks = SERPENT_PARALLEL_BLOCKS, | 100 | .num_blocks = SERPENT_PARALLEL_BLOCKS, |
132 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_decrypt_cbc_xway) } | 101 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(serpent_cbc_dec_8way_avx) } |
133 | }, { | 102 | }, { |
134 | .num_blocks = 1, | 103 | .num_blocks = 1, |
135 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } | 104 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(__serpent_decrypt) } |
@@ -193,7 +162,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
193 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | 162 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
194 | 163 | ||
195 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | 164 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
196 | serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); | 165 | serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); |
197 | return; | 166 | return; |
198 | } | 167 | } |
199 | 168 | ||
@@ -210,7 +179,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
210 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | 179 | ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); |
211 | 180 | ||
212 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | 181 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { |
213 | serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); | 182 | serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); |
214 | return; | 183 | return; |
215 | } | 184 | } |
216 | 185 | ||
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c index 9107a9908c41..97a356ece24d 100644 --- a/arch/x86/crypto/serpent_sse2_glue.c +++ b/arch/x86/crypto/serpent_sse2_glue.c | |||
@@ -59,19 +59,19 @@ static void serpent_decrypt_cbc_xway(void *ctx, u128 *dst, const u128 *src) | |||
59 | u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); | 59 | u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); |
60 | } | 60 | } |
61 | 61 | ||
62 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) | 62 | static void serpent_crypt_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
63 | { | 63 | { |
64 | be128 ctrblk; | 64 | be128 ctrblk; |
65 | 65 | ||
66 | u128_to_be128(&ctrblk, iv); | 66 | le128_to_be128(&ctrblk, iv); |
67 | u128_inc(iv); | 67 | le128_inc(iv); |
68 | 68 | ||
69 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); | 69 | __serpent_encrypt(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); |
70 | u128_xor(dst, src, (u128 *)&ctrblk); | 70 | u128_xor(dst, src, (u128 *)&ctrblk); |
71 | } | 71 | } |
72 | 72 | ||
73 | static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, | 73 | static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, |
74 | u128 *iv) | 74 | le128 *iv) |
75 | { | 75 | { |
76 | be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; | 76 | be128 ctrblks[SERPENT_PARALLEL_BLOCKS]; |
77 | unsigned int i; | 77 | unsigned int i; |
@@ -80,8 +80,8 @@ static void serpent_crypt_ctr_xway(void *ctx, u128 *dst, const u128 *src, | |||
80 | if (dst != src) | 80 | if (dst != src) |
81 | dst[i] = src[i]; | 81 | dst[i] = src[i]; |
82 | 82 | ||
83 | u128_to_be128(&ctrblks[i], iv); | 83 | le128_to_be128(&ctrblks[i], iv); |
84 | u128_inc(iv); | 84 | le128_inc(iv); |
85 | } | 85 | } |
86 | 86 | ||
87 | serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); | 87 | serpent_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); |
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S index 1585abb13dde..ebac16bfa830 100644 --- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S | |||
@@ -23,7 +23,16 @@ | |||
23 | * | 23 | * |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include "glue_helper-asm-avx.S" | ||
27 | |||
26 | .file "twofish-avx-x86_64-asm_64.S" | 28 | .file "twofish-avx-x86_64-asm_64.S" |
29 | |||
30 | .data | ||
31 | .align 16 | ||
32 | |||
33 | .Lbswap128_mask: | ||
34 | .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 | ||
35 | |||
27 | .text | 36 | .text |
28 | 37 | ||
29 | /* structure of crypto context */ | 38 | /* structure of crypto context */ |
@@ -217,69 +226,45 @@ | |||
217 | vpunpcklqdq x3, t2, x2; \ | 226 | vpunpcklqdq x3, t2, x2; \ |
218 | vpunpckhqdq x3, t2, x3; | 227 | vpunpckhqdq x3, t2, x3; |
219 | 228 | ||
220 | #define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \ | 229 | #define inpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \ |
221 | vpxor (0*4*4)(in), wkey, x0; \ | 230 | vpxor x0, wkey, x0; \ |
222 | vpxor (1*4*4)(in), wkey, x1; \ | 231 | vpxor x1, wkey, x1; \ |
223 | vpxor (2*4*4)(in), wkey, x2; \ | 232 | vpxor x2, wkey, x2; \ |
224 | vpxor (3*4*4)(in), wkey, x3; \ | 233 | vpxor x3, wkey, x3; \ |
225 | \ | 234 | \ |
226 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) | 235 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) |
227 | 236 | ||
228 | #define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ | 237 | #define outunpack_blocks(x0, x1, x2, x3, wkey, t0, t1, t2) \ |
229 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | ||
230 | \ | ||
231 | vpxor x0, wkey, x0; \ | ||
232 | vmovdqu x0, (0*4*4)(out); \ | ||
233 | vpxor x1, wkey, x1; \ | ||
234 | vmovdqu x1, (1*4*4)(out); \ | ||
235 | vpxor x2, wkey, x2; \ | ||
236 | vmovdqu x2, (2*4*4)(out); \ | ||
237 | vpxor x3, wkey, x3; \ | ||
238 | vmovdqu x3, (3*4*4)(out); | ||
239 | |||
240 | #define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \ | ||
241 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ | 238 | transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ |
242 | \ | 239 | \ |
243 | vpxor x0, wkey, x0; \ | 240 | vpxor x0, wkey, x0; \ |
244 | vpxor (0*4*4)(out), x0, x0; \ | 241 | vpxor x1, wkey, x1; \ |
245 | vmovdqu x0, (0*4*4)(out); \ | 242 | vpxor x2, wkey, x2; \ |
246 | vpxor x1, wkey, x1; \ | 243 | vpxor x3, wkey, x3; |
247 | vpxor (1*4*4)(out), x1, x1; \ | ||
248 | vmovdqu x1, (1*4*4)(out); \ | ||
249 | vpxor x2, wkey, x2; \ | ||
250 | vpxor (2*4*4)(out), x2, x2; \ | ||
251 | vmovdqu x2, (2*4*4)(out); \ | ||
252 | vpxor x3, wkey, x3; \ | ||
253 | vpxor (3*4*4)(out), x3, x3; \ | ||
254 | vmovdqu x3, (3*4*4)(out); | ||
255 | 244 | ||
256 | .align 8 | 245 | .align 8 |
257 | .global __twofish_enc_blk_8way | 246 | .type __twofish_enc_blk8,@function; |
258 | .type __twofish_enc_blk_8way,@function; | ||
259 | 247 | ||
260 | __twofish_enc_blk_8way: | 248 | __twofish_enc_blk8: |
261 | /* input: | 249 | /* input: |
262 | * %rdi: ctx, CTX | 250 | * %rdi: ctx, CTX |
263 | * %rsi: dst | 251 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks |
264 | * %rdx: src | 252 | * output: |
265 | * %rcx: bool, if true: xor output | 253 | * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks |
266 | */ | 254 | */ |
267 | 255 | ||
256 | vmovdqu w(CTX), RK1; | ||
257 | |||
268 | pushq %rbp; | 258 | pushq %rbp; |
269 | pushq %rbx; | 259 | pushq %rbx; |
270 | pushq %rcx; | 260 | pushq %rcx; |
271 | 261 | ||
272 | vmovdqu w(CTX), RK1; | 262 | inpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); |
273 | |||
274 | leaq (4*4*4)(%rdx), %rax; | ||
275 | inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); | ||
276 | preload_rgi(RA1); | 263 | preload_rgi(RA1); |
277 | rotate_1l(RD1); | 264 | rotate_1l(RD1); |
278 | inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); | 265 | inpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); |
279 | rotate_1l(RD2); | 266 | rotate_1l(RD2); |
280 | 267 | ||
281 | movq %rsi, %r11; | ||
282 | |||
283 | encrypt_cycle(0); | 268 | encrypt_cycle(0); |
284 | encrypt_cycle(1); | 269 | encrypt_cycle(1); |
285 | encrypt_cycle(2); | 270 | encrypt_cycle(2); |
@@ -295,47 +280,33 @@ __twofish_enc_blk_8way: | |||
295 | popq %rbx; | 280 | popq %rbx; |
296 | popq %rbp; | 281 | popq %rbp; |
297 | 282 | ||
298 | leaq (4*4*4)(%r11), %rax; | 283 | outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); |
299 | 284 | outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); | |
300 | testb %cl, %cl; | ||
301 | jnz __enc_xor8; | ||
302 | |||
303 | outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); | ||
304 | outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); | ||
305 | |||
306 | ret; | ||
307 | |||
308 | __enc_xor8: | ||
309 | outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); | ||
310 | outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); | ||
311 | 285 | ||
312 | ret; | 286 | ret; |
313 | 287 | ||
314 | .align 8 | 288 | .align 8 |
315 | .global twofish_dec_blk_8way | 289 | .type __twofish_dec_blk8,@function; |
316 | .type twofish_dec_blk_8way,@function; | ||
317 | 290 | ||
318 | twofish_dec_blk_8way: | 291 | __twofish_dec_blk8: |
319 | /* input: | 292 | /* input: |
320 | * %rdi: ctx, CTX | 293 | * %rdi: ctx, CTX |
321 | * %rsi: dst | 294 | * RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2: encrypted blocks |
322 | * %rdx: src | 295 | * output: |
296 | * RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks | ||
323 | */ | 297 | */ |
324 | 298 | ||
299 | vmovdqu (w+4*4)(CTX), RK1; | ||
300 | |||
325 | pushq %rbp; | 301 | pushq %rbp; |
326 | pushq %rbx; | 302 | pushq %rbx; |
327 | 303 | ||
328 | vmovdqu (w+4*4)(CTX), RK1; | 304 | inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); |
329 | |||
330 | leaq (4*4*4)(%rdx), %rax; | ||
331 | inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2); | ||
332 | preload_rgi(RC1); | 305 | preload_rgi(RC1); |
333 | rotate_1l(RA1); | 306 | rotate_1l(RA1); |
334 | inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); | 307 | inpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2); |
335 | rotate_1l(RA2); | 308 | rotate_1l(RA2); |
336 | 309 | ||
337 | movq %rsi, %r11; | ||
338 | |||
339 | decrypt_cycle(7); | 310 | decrypt_cycle(7); |
340 | decrypt_cycle(6); | 311 | decrypt_cycle(6); |
341 | decrypt_cycle(5); | 312 | decrypt_cycle(5); |
@@ -350,8 +321,103 @@ twofish_dec_blk_8way: | |||
350 | popq %rbx; | 321 | popq %rbx; |
351 | popq %rbp; | 322 | popq %rbp; |
352 | 323 | ||
353 | leaq (4*4*4)(%r11), %rax; | 324 | outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); |
354 | outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2); | 325 | outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); |
355 | outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2); | 326 | |
327 | ret; | ||
328 | |||
329 | .align 8 | ||
330 | .global twofish_ecb_enc_8way | ||
331 | .type twofish_ecb_enc_8way,@function; | ||
332 | |||
333 | twofish_ecb_enc_8way: | ||
334 | /* input: | ||
335 | * %rdi: ctx, CTX | ||
336 | * %rsi: dst | ||
337 | * %rdx: src | ||
338 | */ | ||
339 | |||
340 | movq %rsi, %r11; | ||
341 | |||
342 | load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
343 | |||
344 | call __twofish_enc_blk8; | ||
345 | |||
346 | store_8way(%r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); | ||
347 | |||
348 | ret; | ||
349 | |||
350 | .align 8 | ||
351 | .global twofish_ecb_dec_8way | ||
352 | .type twofish_ecb_dec_8way,@function; | ||
353 | |||
354 | twofish_ecb_dec_8way: | ||
355 | /* input: | ||
356 | * %rdi: ctx, CTX | ||
357 | * %rsi: dst | ||
358 | * %rdx: src | ||
359 | */ | ||
360 | |||
361 | movq %rsi, %r11; | ||
362 | |||
363 | load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); | ||
364 | |||
365 | call __twofish_dec_blk8; | ||
366 | |||
367 | store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
368 | |||
369 | ret; | ||
370 | |||
371 | .align 8 | ||
372 | .global twofish_cbc_dec_8way | ||
373 | .type twofish_cbc_dec_8way,@function; | ||
374 | |||
375 | twofish_cbc_dec_8way: | ||
376 | /* input: | ||
377 | * %rdi: ctx, CTX | ||
378 | * %rsi: dst | ||
379 | * %rdx: src | ||
380 | */ | ||
381 | |||
382 | pushq %r12; | ||
383 | |||
384 | movq %rsi, %r11; | ||
385 | movq %rdx, %r12; | ||
386 | |||
387 | load_8way(%rdx, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); | ||
388 | |||
389 | call __twofish_dec_blk8; | ||
390 | |||
391 | store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2); | ||
392 | |||
393 | popq %r12; | ||
394 | |||
395 | ret; | ||
396 | |||
397 | .align 8 | ||
398 | .global twofish_ctr_8way | ||
399 | .type twofish_ctr_8way,@function; | ||
400 | |||
401 | twofish_ctr_8way: | ||
402 | /* input: | ||
403 | * %rdi: ctx, CTX | ||
404 | * %rsi: dst | ||
405 | * %rdx: src | ||
406 | * %rcx: iv (little endian, 128bit) | ||
407 | */ | ||
408 | |||
409 | pushq %r12; | ||
410 | |||
411 | movq %rsi, %r11; | ||
412 | movq %rdx, %r12; | ||
413 | |||
414 | load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2, | ||
415 | RD2, RX0, RX1, RY0); | ||
416 | |||
417 | call __twofish_enc_blk8; | ||
418 | |||
419 | store_ctr_8way(%r12, %r11, RC1, RD1, RA1, RB1, RC2, RD2, RA2, RB2); | ||
420 | |||
421 | popq %r12; | ||
356 | 422 | ||
357 | ret; | 423 | ret; |
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c index e7708b5442e0..94ac91d26e47 100644 --- a/arch/x86/crypto/twofish_avx_glue.c +++ b/arch/x86/crypto/twofish_avx_glue.c | |||
@@ -45,66 +45,23 @@ | |||
45 | 45 | ||
46 | #define TWOFISH_PARALLEL_BLOCKS 8 | 46 | #define TWOFISH_PARALLEL_BLOCKS 8 |
47 | 47 | ||
48 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
49 | const u8 *src) | ||
50 | { | ||
51 | __twofish_enc_blk_3way(ctx, dst, src, false); | ||
52 | } | ||
53 | |||
54 | /* 8-way parallel cipher functions */ | 48 | /* 8-way parallel cipher functions */ |
55 | asmlinkage void __twofish_enc_blk_8way(struct twofish_ctx *ctx, u8 *dst, | 49 | asmlinkage void twofish_ecb_enc_8way(struct twofish_ctx *ctx, u8 *dst, |
56 | const u8 *src, bool xor); | 50 | const u8 *src); |
57 | asmlinkage void twofish_dec_blk_8way(struct twofish_ctx *ctx, u8 *dst, | 51 | asmlinkage void twofish_ecb_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
58 | const u8 *src); | 52 | const u8 *src); |
59 | 53 | ||
60 | static inline void twofish_enc_blk_xway(struct twofish_ctx *ctx, u8 *dst, | 54 | asmlinkage void twofish_cbc_dec_8way(struct twofish_ctx *ctx, u8 *dst, |
61 | const u8 *src) | 55 | const u8 *src); |
62 | { | 56 | asmlinkage void twofish_ctr_8way(struct twofish_ctx *ctx, u8 *dst, |
63 | __twofish_enc_blk_8way(ctx, dst, src, false); | 57 | const u8 *src, le128 *iv); |
64 | } | ||
65 | |||
66 | static inline void twofish_enc_blk_xway_xor(struct twofish_ctx *ctx, u8 *dst, | ||
67 | const u8 *src) | ||
68 | { | ||
69 | __twofish_enc_blk_8way(ctx, dst, src, true); | ||
70 | } | ||
71 | 58 | ||
72 | static inline void twofish_dec_blk_xway(struct twofish_ctx *ctx, u8 *dst, | 59 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, |
73 | const u8 *src) | 60 | const u8 *src) |
74 | { | 61 | { |
75 | twofish_dec_blk_8way(ctx, dst, src); | 62 | __twofish_enc_blk_3way(ctx, dst, src, false); |
76 | } | ||
77 | |||
78 | static void twofish_dec_blk_cbc_xway(void *ctx, u128 *dst, const u128 *src) | ||
79 | { | ||
80 | u128 ivs[TWOFISH_PARALLEL_BLOCKS - 1]; | ||
81 | unsigned int j; | ||
82 | |||
83 | for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) | ||
84 | ivs[j] = src[j]; | ||
85 | |||
86 | twofish_dec_blk_xway(ctx, (u8 *)dst, (u8 *)src); | ||
87 | |||
88 | for (j = 0; j < TWOFISH_PARALLEL_BLOCKS - 1; j++) | ||
89 | u128_xor(dst + (j + 1), dst + (j + 1), ivs + j); | ||
90 | } | 63 | } |
91 | 64 | ||
92 | static void twofish_enc_blk_ctr_xway(void *ctx, u128 *dst, const u128 *src, | ||
93 | u128 *iv) | ||
94 | { | ||
95 | be128 ctrblks[TWOFISH_PARALLEL_BLOCKS]; | ||
96 | unsigned int i; | ||
97 | |||
98 | for (i = 0; i < TWOFISH_PARALLEL_BLOCKS; i++) { | ||
99 | if (dst != src) | ||
100 | dst[i] = src[i]; | ||
101 | |||
102 | u128_to_be128(&ctrblks[i], iv); | ||
103 | u128_inc(iv); | ||
104 | } | ||
105 | |||
106 | twofish_enc_blk_xway_xor(ctx, (u8 *)dst, (u8 *)ctrblks); | ||
107 | } | ||
108 | 65 | ||
109 | static const struct common_glue_ctx twofish_enc = { | 66 | static const struct common_glue_ctx twofish_enc = { |
110 | .num_funcs = 3, | 67 | .num_funcs = 3, |
@@ -112,7 +69,7 @@ static const struct common_glue_ctx twofish_enc = { | |||
112 | 69 | ||
113 | .funcs = { { | 70 | .funcs = { { |
114 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | 71 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, |
115 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_xway) } | 72 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_enc_8way) } |
116 | }, { | 73 | }, { |
117 | .num_blocks = 3, | 74 | .num_blocks = 3, |
118 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } | 75 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_enc_blk_3way) } |
@@ -128,7 +85,7 @@ static const struct common_glue_ctx twofish_ctr = { | |||
128 | 85 | ||
129 | .funcs = { { | 86 | .funcs = { { |
130 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | 87 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, |
131 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_xway) } | 88 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_ctr_8way) } |
132 | }, { | 89 | }, { |
133 | .num_blocks = 3, | 90 | .num_blocks = 3, |
134 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } | 91 | .fn_u = { .ctr = GLUE_CTR_FUNC_CAST(twofish_enc_blk_ctr_3way) } |
@@ -144,7 +101,7 @@ static const struct common_glue_ctx twofish_dec = { | |||
144 | 101 | ||
145 | .funcs = { { | 102 | .funcs = { { |
146 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | 103 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, |
147 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_xway) } | 104 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_ecb_dec_8way) } |
148 | }, { | 105 | }, { |
149 | .num_blocks = 3, | 106 | .num_blocks = 3, |
150 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } | 107 | .fn_u = { .ecb = GLUE_FUNC_CAST(twofish_dec_blk_3way) } |
@@ -160,7 +117,7 @@ static const struct common_glue_ctx twofish_dec_cbc = { | |||
160 | 117 | ||
161 | .funcs = { { | 118 | .funcs = { { |
162 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, | 119 | .num_blocks = TWOFISH_PARALLEL_BLOCKS, |
163 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_xway) } | 120 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_cbc_dec_8way) } |
164 | }, { | 121 | }, { |
165 | .num_blocks = 3, | 122 | .num_blocks = 3, |
166 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } | 123 | .fn_u = { .cbc = GLUE_CBC_FUNC_CAST(twofish_dec_blk_cbc_3way) } |
@@ -227,7 +184,7 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
227 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | 184 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); |
228 | 185 | ||
229 | if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { | 186 | if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { |
230 | twofish_enc_blk_xway(ctx->ctx, srcdst, srcdst); | 187 | twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); |
231 | return; | 188 | return; |
232 | } | 189 | } |
233 | 190 | ||
@@ -249,7 +206,7 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |||
249 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); | 206 | ctx->fpu_enabled = twofish_fpu_begin(ctx->fpu_enabled, nbytes); |
250 | 207 | ||
251 | if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { | 208 | if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { |
252 | twofish_dec_blk_xway(ctx->ctx, srcdst, srcdst); | 209 | twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); |
253 | return; | 210 | return; |
254 | } | 211 | } |
255 | 212 | ||
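
The glue tables above are consumed widest-entry-first: the generic walker uses the 8-way function while at least eight blocks remain, then falls back to the 3-way and single-block entries for the tail. A minimal sketch of that dispatch, ignoring the scatterlist walking and FPU management that the real glue_ecb_crypt_128bit() in glue_helper.c performs, and assuming the table ends with a 1-block entry (as these tables do):

	static void ecb_dispatch_sketch(const struct common_glue_ctx *gctx,
					void *ctx, u8 *dst, const u8 *src,
					unsigned int nblocks)
	{
		unsigned int i;

		while (nblocks) {
			/* entries are ordered from widest to narrowest */
			for (i = 0; i < gctx->num_funcs; i++) {
				unsigned int w = gctx->funcs[i].num_blocks;

				if (nblocks < w)
					continue;

				gctx->funcs[i].fn_u.ecb(ctx, dst, src);
				dst += w * 16;	/* 16-byte (128-bit) blocks */
				src += w * 16;
				nblocks -= w;
				break;
			}
		}
	}
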
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index aa3eb358b7e8..13e63b3e1dfb 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c | |||
@@ -62,15 +62,15 @@ void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src) | |||
62 | } | 62 | } |
63 | EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); | 63 | EXPORT_SYMBOL_GPL(twofish_dec_blk_cbc_3way); |
64 | 64 | ||
65 | void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) | 65 | void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, le128 *iv) |
66 | { | 66 | { |
67 | be128 ctrblk; | 67 | be128 ctrblk; |
68 | 68 | ||
69 | if (dst != src) | 69 | if (dst != src) |
70 | *dst = *src; | 70 | *dst = *src; |
71 | 71 | ||
72 | u128_to_be128(&ctrblk, iv); | 72 | le128_to_be128(&ctrblk, iv); |
73 | u128_inc(iv); | 73 | le128_inc(iv); |
74 | 74 | ||
75 | twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); | 75 | twofish_enc_blk(ctx, (u8 *)&ctrblk, (u8 *)&ctrblk); |
76 | u128_xor(dst, dst, (u128 *)&ctrblk); | 76 | u128_xor(dst, dst, (u128 *)&ctrblk); |
@@ -78,7 +78,7 @@ void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, u128 *iv) | |||
78 | EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); | 78 | EXPORT_SYMBOL_GPL(twofish_enc_blk_ctr); |
79 | 79 | ||
80 | void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, | 80 | void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, |
81 | u128 *iv) | 81 | le128 *iv) |
82 | { | 82 | { |
83 | be128 ctrblks[3]; | 83 | be128 ctrblks[3]; |
84 | 84 | ||
@@ -88,12 +88,12 @@ void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, | |||
88 | dst[2] = src[2]; | 88 | dst[2] = src[2]; |
89 | } | 89 | } |
90 | 90 | ||
91 | u128_to_be128(&ctrblks[0], iv); | 91 | le128_to_be128(&ctrblks[0], iv); |
92 | u128_inc(iv); | 92 | le128_inc(iv); |
93 | u128_to_be128(&ctrblks[1], iv); | 93 | le128_to_be128(&ctrblks[1], iv); |
94 | u128_inc(iv); | 94 | le128_inc(iv); |
95 | u128_to_be128(&ctrblks[2], iv); | 95 | le128_to_be128(&ctrblks[2], iv); |
96 | u128_inc(iv); | 96 | le128_inc(iv); |
97 | 97 | ||
98 | twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); | 98 | twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks); |
99 | } | 99 | } |
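
The pre-copy of src into dst before twofish_enc_blk_xor_3way() is what makes the 3-way CTR path single-pass: the _xor variant computes dst ^= E_K(ctrblks) rather than dst = E_K(ctrblks), so parking the plaintext in dst first leaves the ciphertext there with no temporary keystream buffer. Simplified sketch of the function above, dropping the dst != src check:

	static void ctr_3way_sketch(struct twofish_ctx *ctx, u128 *dst,
				    const u128 *src, le128 *iv)
	{
		be128 ctrblks[3];
		int i;

		for (i = 0; i < 3; i++) {
			dst[i] = src[i];		 /* park the plaintext */
			le128_to_be128(&ctrblks[i], iv); /* counter -> BE block */
			le128_inc(iv);
		}

		/* dst[i] ^= E_K(ctrblks[i]): ciphertext lands in dst */
		twofish_enc_blk_xor_3way(ctx, (u8 *)dst, (u8 *)ctrblks);
	}
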
diff --git a/arch/x86/include/asm/crypto/camellia.h b/arch/x86/include/asm/crypto/camellia.h new file mode 100644 index 000000000000..98038add801e --- /dev/null +++ b/arch/x86/include/asm/crypto/camellia.h | |||
@@ -0,0 +1,82 @@ | |||
1 | #ifndef ASM_X86_CAMELLIA_H | ||
2 | #define ASM_X86_CAMELLIA_H | ||
3 | |||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/crypto.h> | ||
6 | |||
7 | #define CAMELLIA_MIN_KEY_SIZE 16 | ||
8 | #define CAMELLIA_MAX_KEY_SIZE 32 | ||
9 | #define CAMELLIA_BLOCK_SIZE 16 | ||
10 | #define CAMELLIA_TABLE_BYTE_LEN 272 | ||
11 | #define CAMELLIA_PARALLEL_BLOCKS 2 | ||
12 | |||
13 | struct camellia_ctx { | ||
14 | u64 key_table[CAMELLIA_TABLE_BYTE_LEN / sizeof(u64)]; | ||
15 | u32 key_length; | ||
16 | }; | ||
17 | |||
18 | struct camellia_lrw_ctx { | ||
19 | struct lrw_table_ctx lrw_table; | ||
20 | struct camellia_ctx camellia_ctx; | ||
21 | }; | ||
22 | |||
23 | struct camellia_xts_ctx { | ||
24 | struct camellia_ctx tweak_ctx; | ||
25 | struct camellia_ctx crypt_ctx; | ||
26 | }; | ||
27 | |||
28 | extern int __camellia_setkey(struct camellia_ctx *cctx, | ||
29 | const unsigned char *key, | ||
30 | unsigned int key_len, u32 *flags); | ||
31 | |||
32 | extern int lrw_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
33 | unsigned int keylen); | ||
34 | extern void lrw_camellia_exit_tfm(struct crypto_tfm *tfm); | ||
35 | |||
36 | extern int xts_camellia_setkey(struct crypto_tfm *tfm, const u8 *key, | ||
37 | unsigned int keylen); | ||
38 | |||
39 | /* regular block cipher functions */ | ||
40 | asmlinkage void __camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, | ||
41 | const u8 *src, bool xor); | ||
42 | asmlinkage void camellia_dec_blk(struct camellia_ctx *ctx, u8 *dst, | ||
43 | const u8 *src); | ||
44 | |||
45 | /* 2-way parallel cipher functions */ | ||
46 | asmlinkage void __camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, | ||
47 | const u8 *src, bool xor); | ||
48 | asmlinkage void camellia_dec_blk_2way(struct camellia_ctx *ctx, u8 *dst, | ||
49 | const u8 *src); | ||
50 | |||
51 | static inline void camellia_enc_blk(struct camellia_ctx *ctx, u8 *dst, | ||
52 | const u8 *src) | ||
53 | { | ||
54 | __camellia_enc_blk(ctx, dst, src, false); | ||
55 | } | ||
56 | |||
57 | static inline void camellia_enc_blk_xor(struct camellia_ctx *ctx, u8 *dst, | ||
58 | const u8 *src) | ||
59 | { | ||
60 | __camellia_enc_blk(ctx, dst, src, true); | ||
61 | } | ||
62 | |||
63 | static inline void camellia_enc_blk_2way(struct camellia_ctx *ctx, u8 *dst, | ||
64 | const u8 *src) | ||
65 | { | ||
66 | __camellia_enc_blk_2way(ctx, dst, src, false); | ||
67 | } | ||
68 | |||
69 | static inline void camellia_enc_blk_xor_2way(struct camellia_ctx *ctx, u8 *dst, | ||
70 | const u8 *src) | ||
71 | { | ||
72 | __camellia_enc_blk_2way(ctx, dst, src, true); | ||
73 | } | ||
74 | |||
75 | /* glue helpers */ | ||
76 | extern void camellia_decrypt_cbc_2way(void *ctx, u128 *dst, const u128 *src); | ||
77 | extern void camellia_crypt_ctr(void *ctx, u128 *dst, const u128 *src, | ||
78 | le128 *iv); | ||
79 | extern void camellia_crypt_ctr_2way(void *ctx, u128 *dst, const u128 *src, | ||
80 | le128 *iv); | ||
81 | |||
82 | #endif /* ASM_X86_CAMELLIA_H */ | ||
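
camellia_xts_ctx above holds two full key schedules because XTS uses independent keys for the data blocks and the tweak. A hedged sketch of what the setkey helper declared here does; the real xts_camellia_setkey() lives in camellia_glue.c and may differ in detail:

	static int xts_setkey_sketch(struct crypto_tfm *tfm, const u8 *key,
				     unsigned int keylen)
	{
		struct camellia_xts_ctx *ctx = crypto_tfm_ctx(tfm);
		u32 *flags = &tfm->crt_flags;
		int err;

		/* an XTS key is two equal-size keys concatenated */
		if (keylen % 2) {
			*flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
			return -EINVAL;
		}

		/* first half drives the data-block cipher... */
		err = __camellia_setkey(&ctx->crypt_ctx, key, keylen / 2, flags);
		if (err)
			return err;

		/* ...second half drives the tweak cipher */
		return __camellia_setkey(&ctx->tweak_ctx, key + keylen / 2,
					 keylen / 2, flags);
	}
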
diff --git a/arch/x86/include/asm/crypto/glue_helper.h b/arch/x86/include/asm/crypto/glue_helper.h index 3e408bddc96f..e2d65b061d27 100644 --- a/arch/x86/include/asm/crypto/glue_helper.h +++ b/arch/x86/include/asm/crypto/glue_helper.h | |||
@@ -13,7 +13,7 @@ | |||
13 | typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); | 13 | typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); |
14 | typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); | 14 | typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); |
15 | typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, | 15 | typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, |
16 | u128 *iv); | 16 | le128 *iv); |
17 | 17 | ||
18 | #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) | 18 | #define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) |
19 | #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) | 19 | #define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) |
@@ -71,23 +71,29 @@ static inline void glue_fpu_end(bool fpu_enabled) | |||
71 | kernel_fpu_end(); | 71 | kernel_fpu_end(); |
72 | } | 72 | } |
73 | 73 | ||
74 | static inline void u128_to_be128(be128 *dst, const u128 *src) | 74 | static inline void le128_to_be128(be128 *dst, const le128 *src) |
75 | { | 75 | { |
76 | dst->a = cpu_to_be64(src->a); | 76 | dst->a = cpu_to_be64(le64_to_cpu(src->a)); |
77 | dst->b = cpu_to_be64(src->b); | 77 | dst->b = cpu_to_be64(le64_to_cpu(src->b)); |
78 | } | 78 | } |
79 | 79 | ||
80 | static inline void be128_to_u128(u128 *dst, const be128 *src) | 80 | static inline void be128_to_le128(le128 *dst, const be128 *src) |
81 | { | 81 | { |
82 | dst->a = be64_to_cpu(src->a); | 82 | dst->a = cpu_to_le64(be64_to_cpu(src->a)); |
83 | dst->b = be64_to_cpu(src->b); | 83 | dst->b = cpu_to_le64(be64_to_cpu(src->b)); |
84 | } | 84 | } |
85 | 85 | ||
86 | static inline void u128_inc(u128 *i) | 86 | static inline void le128_inc(le128 *i) |
87 | { | 87 | { |
88 | i->b++; | 88 | u64 a = le64_to_cpu(i->a); |
89 | if (!i->b) | 89 | u64 b = le64_to_cpu(i->b); |
90 | i->a++; | 90 | |
91 | b++; | ||
92 | if (!b) | ||
93 | a++; | ||
94 | |||
95 | i->a = cpu_to_le64(a); | ||
96 | i->b = cpu_to_le64(b); | ||
91 | } | 97 | } |
92 | 98 | ||
93 | extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, | 99 | extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, |
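
le128_inc() treats the pair (a, b) as a single 128-bit little-endian counter with a as the high quadword: b is incremented and an overflow carries into a. A self-contained userspace analog (hypothetical names; assumes the byte swaps collapse to no-ops, as they do on x86):

	#include <stdint.h>
	#include <stdio.h>

	typedef struct { uint64_t b, a; } le128_analog;	/* b = low, a = high */

	static void le128_inc_analog(le128_analog *i)
	{
		i->b++;
		if (!i->b)
			i->a++;	/* carry into the high quadword */
	}

	int main(void)
	{
		le128_analog ctr = { .a = 1, .b = UINT64_MAX };

		le128_inc_analog(&ctr);
		printf("a=%llu b=%llu\n",
		       (unsigned long long)ctr.a, (unsigned long long)ctr.b);
		/* prints "a=2 b=0": the low half wrapped and carried */
		return 0;
	}
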
diff --git a/arch/x86/include/asm/crypto/serpent-avx.h b/arch/x86/include/asm/crypto/serpent-avx.h index 432deedd2945..0da1d3e2a55c 100644 --- a/arch/x86/include/asm/crypto/serpent-avx.h +++ b/arch/x86/include/asm/crypto/serpent-avx.h | |||
@@ -6,27 +6,14 @@ | |||
6 | 6 | ||
7 | #define SERPENT_PARALLEL_BLOCKS 8 | 7 | #define SERPENT_PARALLEL_BLOCKS 8 |
8 | 8 | ||
9 | asmlinkage void __serpent_enc_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 9 | asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
10 | const u8 *src, bool xor); | 10 | const u8 *src); |
11 | asmlinkage void serpent_dec_blk_8way_avx(struct serpent_ctx *ctx, u8 *dst, | 11 | asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
12 | const u8 *src); | 12 | const u8 *src); |
13 | 13 | ||
14 | static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst, | 14 | asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
15 | const u8 *src) | 15 | const u8 *src); |
16 | { | 16 | asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx, u8 *dst, |
17 | __serpent_enc_blk_8way_avx(ctx, dst, src, false); | 17 | const u8 *src, le128 *iv); |
18 | } | ||
19 | |||
20 | static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst, | ||
21 | const u8 *src) | ||
22 | { | ||
23 | __serpent_enc_blk_8way_avx(ctx, dst, src, true); | ||
24 | } | ||
25 | |||
26 | static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst, | ||
27 | const u8 *src) | ||
28 | { | ||
29 | serpent_dec_blk_8way_avx(ctx, dst, src); | ||
30 | } | ||
31 | 18 | ||
32 | #endif | 19 | #endif |
diff --git a/arch/x86/include/asm/crypto/twofish.h b/arch/x86/include/asm/crypto/twofish.h index 9d2c514bd5f9..878c51ceebb5 100644 --- a/arch/x86/include/asm/crypto/twofish.h +++ b/arch/x86/include/asm/crypto/twofish.h | |||
@@ -31,9 +31,9 @@ asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, | |||
31 | /* helpers from twofish_x86_64-3way module */ | 31 | /* helpers from twofish_x86_64-3way module */ |
32 | extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); | 32 | extern void twofish_dec_blk_cbc_3way(void *ctx, u128 *dst, const u128 *src); |
33 | extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, | 33 | extern void twofish_enc_blk_ctr(void *ctx, u128 *dst, const u128 *src, |
34 | u128 *iv); | 34 | le128 *iv); |
35 | extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, | 35 | extern void twofish_enc_blk_ctr_3way(void *ctx, u128 *dst, const u128 *src, |
36 | u128 *iv); | 36 | le128 *iv); |
37 | 37 | ||
38 | extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, | 38 | extern int lrw_twofish_setkey(struct crypto_tfm *tfm, const u8 *key, |
39 | unsigned int keylen); | 39 | unsigned int keylen); |