diff options
author | Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 2011-09-26 09:47:25 -0400 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2011-10-21 08:23:08 -0400 |
commit | 8280daad436edb7dd9e7e06fc13bcecb6b2a885c (patch) | |
tree | 0d4cb032c6da8617bd4a2dd84bd8ef1a605fa19d /arch/x86/crypto | |
parent | 91d41f159d75d602f6001218eec64c5e761475a6 (diff) |
crypto: twofish - add 3-way parallel x86_64 assembler implementation
This patch adds a 3-way parallel x86_64 assembly implementation of twofish as a
new module. The new assembler functions process data in three-block chunks,
improving cipher performance on out-of-order CPUs.
Patch has been tested with tcrypt and automated filesystem tests.
Summary of the tcrypt benchmarks:
Twofish 3-way-asm vs twofish asm (128bit 8kb block ECB)
encrypt: 1.3x speed
decrypt: 1.3x speed
Twofish 3-way-asm vs twofish asm (128bit 8kb block CBC)
encrypt: 1.07x speed
decrypt: 1.4x speed
Twofish 3-way-asm vs twofish asm (128bit 8kb block CTR)
encrypt: 1.4x speed
Twofish 3-way-asm vs AES asm (128bit 8kb block ECB)
encrypt: 1.0x speed
decrypt: 1.0x speed
Twofish 3-way-asm vs AES asm (128bit 8kb block CBC)
encrypt: 0.84x speed
decrypt: 1.09x speed
Twofish 3-way-asm vs AES asm (128bit 8kb block CTR)
encrypt: 1.15x speed
Full output:
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-3way-asm-x86_64.txt
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-asm-x86_64.txt
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-aes-asm-x86_64.txt
Tests were run on:
vendor_id : AuthenticAMD
cpu family : 16
model : 10
model name : AMD Phenom(tm) II X6 1055T Processor
Userspace tests were also run on:
vendor_id : GenuineIntel
cpu family : 6
model : 15
model name : Intel(R) Xeon(R) CPU E7330 @ 2.40GHz
stepping : 11
Userspace test results:
Encryption/decryption of twofish 3-way vs x86_64-asm on AMD Phenom II:
encrypt: 1.27x
decrypt: 1.25x
Encryption/decryption of twofish 3-way vs x86_64-asm on Intel Xeon E7330:
encrypt: 1.36x
decrypt: 1.36x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/crypto/twofish-x86_64-asm_64-3way.S | 316 | ||||
-rw-r--r-- | arch/x86/crypto/twofish_glue_3way.c | 472 |
3 files changed, 790 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 725addfacf0a..3537d4b91f74 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile | |||
@@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o | |||
9 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o | 9 | obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o |
10 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o | 10 | obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o |
11 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o | 11 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o |
12 | obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o | ||
12 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o | 13 | obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o |
13 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o | 14 | obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o |
14 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o | 15 | obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o |
@@ -23,6 +24,7 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o | |||
23 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o | 24 | aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o |
24 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o | 25 | blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o |
25 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o | 26 | twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o |
27 | twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o | ||
26 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o | 28 | salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o |
27 | 29 | ||
28 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o | 30 | aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o |
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S new file mode 100644 index 000000000000..5b012a2c5119 --- /dev/null +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S | |||
@@ -0,0 +1,316 @@ | |||
1 | /* | ||
2 | * Twofish Cipher 3-way parallel algorithm (x86_64) | ||
3 | * | ||
4 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | ||
19 | * USA | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | .file "twofish-x86_64-asm-3way.S" | ||
24 | .text | ||
25 | |||
26 | /* structure of crypto context */ | ||
27 | #define s0 0 | ||
28 | #define s1 1024 | ||
29 | #define s2 2048 | ||
30 | #define s3 3072 | ||
31 | #define w 4096 | ||
32 | #define k 4128 | ||
33 | |||
34 | /********************************************************************** | ||
35 | 3-way twofish | ||
36 | **********************************************************************/ | ||
37 | #define CTX %rdi | ||
38 | #define RIO %rdx | ||
39 | |||
40 | #define RAB0 %rax | ||
41 | #define RAB1 %rbx | ||
42 | #define RAB2 %rcx | ||
43 | |||
44 | #define RAB0d %eax | ||
45 | #define RAB1d %ebx | ||
46 | #define RAB2d %ecx | ||
47 | |||
48 | #define RAB0bh %ah | ||
49 | #define RAB1bh %bh | ||
50 | #define RAB2bh %ch | ||
51 | |||
52 | #define RAB0bl %al | ||
53 | #define RAB1bl %bl | ||
54 | #define RAB2bl %cl | ||
55 | |||
56 | #define RCD0 %r8 | ||
57 | #define RCD1 %r9 | ||
58 | #define RCD2 %r10 | ||
59 | |||
60 | #define RCD0d %r8d | ||
61 | #define RCD1d %r9d | ||
62 | #define RCD2d %r10d | ||
63 | |||
64 | #define RX0 %rbp | ||
65 | #define RX1 %r11 | ||
66 | #define RX2 %r12 | ||
67 | |||
68 | #define RX0d %ebp | ||
69 | #define RX1d %r11d | ||
70 | #define RX2d %r12d | ||
71 | |||
72 | #define RY0 %r13 | ||
73 | #define RY1 %r14 | ||
74 | #define RY2 %r15 | ||
75 | |||
76 | #define RY0d %r13d | ||
77 | #define RY1d %r14d | ||
78 | #define RY2d %r15d | ||
79 | |||
80 | #define RT0 %rdx | ||
81 | #define RT1 %rsi | ||
82 | |||
83 | #define RT0d %edx | ||
84 | #define RT1d %esi | ||
85 | |||
86 | #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) /* two table lookups indexed by the two lowest bytes of ab, combined into dst; ab is rotated right by rot bits */ \ | ||
87 | movzbl ab ## bl, tmp2 ## d; /* tmp2 = lowest byte of ab */ \ | ||
88 | movzbl ab ## bh, tmp1 ## d; /* tmp1 = second-lowest byte of ab */ \ | ||
89 | rorq $(rot), ab; /* rotate ab so later invocations see other bytes in the low positions */ \ | ||
90 | op1##l T0(CTX, tmp2, 4), dst ## d; /* dst = dst op1 32-bit entry tmp2 of the table at CTX+T0 */ \ | ||
91 | op2##l T1(CTX, tmp1, 4), dst ## d; /* dst = dst op2 32-bit entry tmp1 of the table at CTX+T1 */ | ||
92 | |||
93 | /* | ||
94 | * Combined G1 & G2 function. Reordered with help of rotates to have moves | ||
95 | * at beginning. | ||
96 | */ | ||
97 | #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ | ||
98 | /* G1,1 && G2,1 */ \ | ||
99 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ | ||
100 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ | ||
101 | \ | ||
102 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ | ||
103 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ | ||
104 | \ | ||
105 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ | ||
106 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ | ||
107 | \ | ||
108 | /* G1,2 && G2,2 */ \ | ||
109 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ | ||
110 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ | ||
111 | xchgq cd ## 0, ab ## 0; \ | ||
112 | \ | ||
113 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ | ||
114 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ | ||
115 | xchgq cd ## 1, ab ## 1; \ | ||
116 | \ | ||
117 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ | ||
118 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ | ||
119 | xchgq cd ## 2, ab ## 2; | ||
120 | |||
121 | #define enc_round_end(ab, x, y, n) /* finish encryption round n for one block: mix x/y, add the two key words for round n, fold both into the 64-bit register ab */ \ | ||
122 | addl y ## d, x ## d; /* x += y */ \ | ||
123 | addl x ## d, y ## d; /* y += x (x already includes y) */ \ | ||
124 | addl k+4*(2*(n))(CTX), x ## d; /* x += 32-bit word 2n of the array at CTX+k */ \ | ||
125 | xorl ab ## d, x ## d; /* x ^= low 32 bits of ab */ \ | ||
126 | addl k+4*(2*(n)+1)(CTX), y ## d; /* y += 32-bit word 2n+1 of the array at CTX+k */ \ | ||
127 | shrq $32, ab; /* bring the high 32 bits of ab down */ \ | ||
128 | roll $1, ab ## d; /* rotate that half left by one bit */ \ | ||
129 | xorl y ## d, ab ## d; /* and xor it with y */ \ | ||
130 | shlq $32, ab; /* move the result back to the high half, low half is now zero */ \ | ||
131 | rorl $1, x ## d; /* rotate x right by one bit */ \ | ||
132 | orq x, ab; /* low half of ab = x */ | ||
133 | |||
134 | #define dec_round_end(ba, x, y, n) /* finish decryption round n for one block: same structure as enc_round_end with the roles of x and y exchanged when folding into ba */ \ | ||
135 | addl y ## d, x ## d; /* x += y */ \ | ||
136 | addl x ## d, y ## d; /* y += x (x already includes y) */ \ | ||
137 | addl k+4*(2*(n))(CTX), x ## d; /* x += 32-bit word 2n of the array at CTX+k */ \ | ||
138 | addl k+4*(2*(n)+1)(CTX), y ## d; /* y += 32-bit word 2n+1 of the array at CTX+k */ \ | ||
139 | xorl ba ## d, y ## d; /* y ^= low 32 bits of ba */ \ | ||
140 | shrq $32, ba; /* bring the high 32 bits of ba down */ \ | ||
141 | roll $1, ba ## d; /* rotate that half left by one bit */ \ | ||
142 | xorl x ## d, ba ## d; /* and xor it with x */ \ | ||
143 | shlq $32, ba; /* move the result back to the high half, low half is now zero */ \ | ||
144 | rorl $1, y ## d; /* rotate y right by one bit */ \ | ||
145 | orq y, ba; /* low half of ba = y */ | ||
146 | |||
147 | #define encrypt_round3(ab, cd, n) \ | ||
148 | g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ | ||
149 | \ | ||
150 | enc_round_end(ab ## 0, RX0, RY0, n); \ | ||
151 | enc_round_end(ab ## 1, RX1, RY1, n); \ | ||
152 | enc_round_end(ab ## 2, RX2, RY2, n); | ||
153 | |||
154 | #define decrypt_round3(ba, dc, n) \ | ||
155 | g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ | ||
156 | \ | ||
157 | dec_round_end(ba ## 0, RX0, RY0, n); \ | ||
158 | dec_round_end(ba ## 1, RX1, RY1, n); \ | ||
159 | dec_round_end(ba ## 2, RX2, RY2, n); | ||
160 | |||
161 | #define encrypt_cycle3(ab, cd, n) \ | ||
162 | encrypt_round3(ab, cd, n*2); \ | ||
163 | encrypt_round3(ab, cd, (n*2)+1); | ||
164 | |||
165 | #define decrypt_cycle3(ba, dc, n) \ | ||
166 | decrypt_round3(ba, dc, (n*2)+1); \ | ||
167 | decrypt_round3(ba, dc, (n*2)); | ||
168 | |||
169 | #define inpack3(in, n, xy, m) \ | ||
170 | movq 4*(n)(in), xy ## 0; \ | ||
171 | xorq w+4*m(CTX), xy ## 0; \ | ||
172 | \ | ||
173 | movq 4*(4+(n))(in), xy ## 1; \ | ||
174 | xorq w+4*m(CTX), xy ## 1; \ | ||
175 | \ | ||
176 | movq 4*(8+(n))(in), xy ## 2; \ | ||
177 | xorq w+4*m(CTX), xy ## 2; | ||
178 | |||
179 | #define outunpack3(op, out, n, xy, m) \ | ||
180 | xorq w+4*m(CTX), xy ## 0; \ | ||
181 | op ## q xy ## 0, 4*(n)(out); \ | ||
182 | \ | ||
183 | xorq w+4*m(CTX), xy ## 1; \ | ||
184 | op ## q xy ## 1, 4*(4+(n))(out); \ | ||
185 | \ | ||
186 | xorq w+4*m(CTX), xy ## 2; \ | ||
187 | op ## q xy ## 2, 4*(8+(n))(out); | ||
188 | |||
189 | #define inpack_enc3() \ | ||
190 | inpack3(RIO, 0, RAB, 0); \ | ||
191 | inpack3(RIO, 2, RCD, 2); | ||
192 | |||
193 | #define outunpack_enc3(op) \ | ||
194 | outunpack3(op, RIO, 2, RAB, 6); \ | ||
195 | outunpack3(op, RIO, 0, RCD, 4); | ||
196 | |||
197 | #define inpack_dec3() \ | ||
198 | inpack3(RIO, 0, RAB, 4); \ | ||
199 | rorq $32, RAB0; \ | ||
200 | rorq $32, RAB1; \ | ||
201 | rorq $32, RAB2; \ | ||
202 | inpack3(RIO, 2, RCD, 6); \ | ||
203 | rorq $32, RCD0; \ | ||
204 | rorq $32, RCD1; \ | ||
205 | rorq $32, RCD2; | ||
206 | |||
207 | #define outunpack_dec3() \ | ||
208 | rorq $32, RCD0; \ | ||
209 | rorq $32, RCD1; \ | ||
210 | rorq $32, RCD2; \ | ||
211 | outunpack3(mov, RIO, 0, RCD, 0); \ | ||
212 | rorq $32, RAB0; \ | ||
213 | rorq $32, RAB1; \ | ||
214 | rorq $32, RAB2; \ | ||
215 | outunpack3(mov, RIO, 2, RAB, 2); | ||
216 | |||
217 | .align 8 | ||
218 | .global __twofish_enc_blk_3way | ||
219 | .type __twofish_enc_blk_3way,@function; | ||
220 | |||
221 | __twofish_enc_blk_3way: | ||
222 | /* input: | ||
223 | * %rdi: ctx, CTX | ||
224 | * %rsi: dst | ||
225 | * %rdx: src, RIO | ||
226 | * %rcx: bool, if true: xor output | ||
 | * | ||
 | * Reads three 16-byte blocks at src (offsets 0, 16 and 32), runs eight | ||
 | * encrypt_cycle3 expansions (two rounds each) on all three in parallel and | ||
 | * either stores the result to dst or XORs it into dst, depending on %rcx. | ||
 | * %rsi and %rdx are reused as scratch (RT1/RT0) during the rounds, which is | ||
 | * why dst and the xor flag are parked on the stack first. | ||
227 | */ | ||
228 | pushq %r15; /* %r15..%rbx are used as working registers; restored before each ret */ | ||
229 | pushq %r14; | ||
230 | pushq %r13; | ||
231 | pushq %r12; | ||
232 | pushq %rbp; | ||
233 | pushq %rbx; | ||
234 | |||
235 | pushq %rcx; /* bool xor */ | ||
236 | pushq %rsi; /* dst */ | ||
237 | |||
238 | inpack_enc3(); /* load the three blocks from RIO into RAB0-2/RCD0-2, XORed with context words at offset w */ | ||
239 | |||
240 | encrypt_cycle3(RAB, RCD, 0); /* cycle n performs rounds 2n and 2n+1 */ | ||
241 | encrypt_cycle3(RAB, RCD, 1); | ||
242 | encrypt_cycle3(RAB, RCD, 2); | ||
243 | encrypt_cycle3(RAB, RCD, 3); | ||
244 | encrypt_cycle3(RAB, RCD, 4); | ||
245 | encrypt_cycle3(RAB, RCD, 5); | ||
246 | encrypt_cycle3(RAB, RCD, 6); | ||
247 | encrypt_cycle3(RAB, RCD, 7); | ||
248 | |||
249 | popq RIO; /* dst */ | ||
250 | popq %rbp; /* bool xor */ | ||
251 | |||
252 | testb %bpl, %bpl; /* only the low byte of the flag is examined */ | ||
253 | jnz __enc_xor3; | ||
254 | |||
255 | outunpack_enc3(mov); /* plain store of the three output blocks to dst */ | ||
256 | |||
257 | popq %rbx; | ||
258 | popq %rbp; | ||
259 | popq %r12; | ||
260 | popq %r13; | ||
261 | popq %r14; | ||
262 | popq %r15; | ||
263 | ret; | ||
264 | |||
265 | __enc_xor3: | ||
266 | outunpack_enc3(xor); /* XOR the three output blocks into the data already at dst */ | ||
267 | |||
268 | popq %rbx; | ||
269 | popq %rbp; | ||
270 | popq %r12; | ||
271 | popq %r13; | ||
272 | popq %r14; | ||
273 | popq %r15; | ||
274 | ret; | ||
275 | |||
276 | .global twofish_dec_blk_3way | ||
277 | .type twofish_dec_blk_3way,@function; | ||
278 | |||
279 | twofish_dec_blk_3way: | ||
280 | /* input: | ||
281 | * %rdi: ctx, CTX | ||
282 | * %rsi: dst | ||
283 | * %rdx: src, RIO | ||
 | * | ||
 | * Reads three 16-byte blocks at src (offsets 0, 16 and 32), runs the eight | ||
 | * decrypt_cycle3 expansions from cycle 7 down to cycle 0 on all three in | ||
 | * parallel and stores the result to dst. %rsi and %rdx are reused as | ||
 | * scratch (RT1/RT0) during the rounds, so dst is parked on the stack first. | ||
284 | */ | ||
285 | pushq %r15; /* %r15..%rbx are used as working registers; restored before ret */ | ||
286 | pushq %r14; | ||
287 | pushq %r13; | ||
288 | pushq %r12; | ||
289 | pushq %rbp; | ||
290 | pushq %rbx; | ||
291 | |||
292 | pushq %rsi; /* dst */ | ||
293 | |||
294 | inpack_dec3(); /* load the three blocks, XOR with context words at offset w, swap 32-bit halves of each register */ | ||
295 | |||
296 | decrypt_cycle3(RAB, RCD, 7); /* cycle n performs round 2n+1 then round 2n */ | ||
297 | decrypt_cycle3(RAB, RCD, 6); | ||
298 | decrypt_cycle3(RAB, RCD, 5); | ||
299 | decrypt_cycle3(RAB, RCD, 4); | ||
300 | decrypt_cycle3(RAB, RCD, 3); | ||
301 | decrypt_cycle3(RAB, RCD, 2); | ||
302 | decrypt_cycle3(RAB, RCD, 1); | ||
303 | decrypt_cycle3(RAB, RCD, 0); | ||
304 | |||
305 | popq RIO; /* dst */ | ||
306 | |||
307 | outunpack_dec3(); /* swap halves back, XOR with context words at offset w, store to dst */ | ||
308 | |||
309 | popq %rbx; | ||
310 | popq %rbp; | ||
311 | popq %r12; | ||
312 | popq %r13; | ||
313 | popq %r14; | ||
314 | popq %r15; | ||
315 | ret; | ||
316 | |||
diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c new file mode 100644 index 000000000000..0cbf9faea86a --- /dev/null +++ b/arch/x86/crypto/twofish_glue_3way.c | |||
@@ -0,0 +1,472 @@ | |||
1 | /* | ||
2 | * Glue Code for 3-way parallel assembler optimized version of Twofish | ||
3 | * | ||
4 | * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: | ||
7 | * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
8 | * CTR part based on code (crypto/ctr.c) by: | ||
9 | * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or modify | ||
12 | * it under the terms of the GNU General Public License as published by | ||
13 | * the Free Software Foundation; either version 2 of the License, or | ||
14 | * (at your option) any later version. | ||
15 | * | ||
16 | * This program is distributed in the hope that it will be useful, | ||
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
19 | * GNU General Public License for more details. | ||
20 | * | ||
21 | * You should have received a copy of the GNU General Public License | ||
22 | * along with this program; if not, write to the Free Software | ||
23 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | ||
24 | * USA | ||
25 | * | ||
26 | */ | ||
27 | |||
28 | #include <linux/crypto.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/types.h> | ||
32 | #include <crypto/algapi.h> | ||
33 | #include <crypto/twofish.h> | ||
34 | #include <crypto/b128ops.h> | ||
35 | |||
36 | /* regular block cipher functions from twofish_x86_64 module */ | ||
37 | asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, | ||
38 | const u8 *src); | ||
39 | asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, | ||
40 | const u8 *src); | ||
41 | |||
42 | /* 3-way parallel cipher functions */ | ||
43 | asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
44 | const u8 *src, bool xor); | ||
45 | asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
46 | const u8 *src); | ||
47 | |||
48 | static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, | ||
49 | const u8 *src) | ||
50 | { | ||
51 | __twofish_enc_blk_3way(ctx, dst, src, false); | ||
52 | } | ||
53 | |||
54 | static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, | ||
55 | const u8 *src) | ||
56 | { | ||
57 | __twofish_enc_blk_3way(ctx, dst, src, true); | ||
58 | } | ||
59 | |||
60 | static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | ||
61 | void (*fn)(struct twofish_ctx *, u8 *, const u8 *), | ||
62 | void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *)) | ||
63 | { | ||
64 | struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
65 | unsigned int bsize = TF_BLOCK_SIZE; | ||
66 | unsigned int nbytes; | ||
67 | int err; | ||
68 | |||
69 | err = blkcipher_walk_virt(desc, walk); | ||
70 | |||
71 | while ((nbytes = walk->nbytes)) { | ||
72 | u8 *wsrc = walk->src.virt.addr; | ||
73 | u8 *wdst = walk->dst.virt.addr; | ||
74 | |||
75 | /* Process three block batch */ | ||
76 | if (nbytes >= bsize * 3) { | ||
77 | do { | ||
78 | fn_3way(ctx, wdst, wsrc); | ||
79 | |||
80 | wsrc += bsize * 3; | ||
81 | wdst += bsize * 3; | ||
82 | nbytes -= bsize * 3; | ||
83 | } while (nbytes >= bsize * 3); | ||
84 | |||
85 | if (nbytes < bsize) | ||
86 | goto done; | ||
87 | } | ||
88 | |||
89 | /* Handle leftovers */ | ||
90 | do { | ||
91 | fn(ctx, wdst, wsrc); | ||
92 | |||
93 | wsrc += bsize; | ||
94 | wdst += bsize; | ||
95 | nbytes -= bsize; | ||
96 | } while (nbytes >= bsize); | ||
97 | |||
98 | done: | ||
99 | err = blkcipher_walk_done(desc, walk, nbytes); | ||
100 | } | ||
101 | |||
102 | return err; | ||
103 | } | ||
104 | |||
105 | static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
106 | struct scatterlist *src, unsigned int nbytes) | ||
107 | { | ||
108 | struct blkcipher_walk walk; | ||
109 | |||
110 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
111 | return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); | ||
112 | } | ||
113 | |||
114 | static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
115 | struct scatterlist *src, unsigned int nbytes) | ||
116 | { | ||
117 | struct blkcipher_walk walk; | ||
118 | |||
119 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
120 | return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); | ||
121 | } | ||
122 | |||
123 | static struct crypto_alg blk_ecb_alg = { | ||
124 | .cra_name = "ecb(twofish)", | ||
125 | .cra_driver_name = "ecb-twofish-3way", | ||
126 | .cra_priority = 300, | ||
127 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
128 | .cra_blocksize = TF_BLOCK_SIZE, | ||
129 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
130 | .cra_alignmask = 0, | ||
131 | .cra_type = &crypto_blkcipher_type, | ||
132 | .cra_module = THIS_MODULE, | ||
133 | .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), | ||
134 | .cra_u = { | ||
135 | .blkcipher = { | ||
136 | .min_keysize = TF_MIN_KEY_SIZE, | ||
137 | .max_keysize = TF_MAX_KEY_SIZE, | ||
138 | .setkey = twofish_setkey, | ||
139 | .encrypt = ecb_encrypt, | ||
140 | .decrypt = ecb_decrypt, | ||
141 | }, | ||
142 | }, | ||
143 | }; | ||
144 | |||
145 | static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, | ||
146 | struct blkcipher_walk *walk) | ||
147 | { | ||
148 | struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
149 | unsigned int bsize = TF_BLOCK_SIZE; | ||
150 | unsigned int nbytes = walk->nbytes; | ||
151 | u128 *src = (u128 *)walk->src.virt.addr; | ||
152 | u128 *dst = (u128 *)walk->dst.virt.addr; | ||
153 | u128 *iv = (u128 *)walk->iv; | ||
154 | |||
155 | do { | ||
156 | u128_xor(dst, src, iv); | ||
157 | twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); | ||
158 | iv = dst; | ||
159 | |||
160 | src += 1; | ||
161 | dst += 1; | ||
162 | nbytes -= bsize; | ||
163 | } while (nbytes >= bsize); | ||
164 | |||
165 | u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); | ||
166 | return nbytes; | ||
167 | } | ||
168 | |||
169 | static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
170 | struct scatterlist *src, unsigned int nbytes) | ||
171 | { | ||
172 | struct blkcipher_walk walk; | ||
173 | int err; | ||
174 | |||
175 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
176 | err = blkcipher_walk_virt(desc, &walk); | ||
177 | |||
178 | while ((nbytes = walk.nbytes)) { | ||
179 | nbytes = __cbc_encrypt(desc, &walk); | ||
180 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
181 | } | ||
182 | |||
183 | return err; | ||
184 | } | ||
185 | |||
186 | static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, | ||
187 | struct blkcipher_walk *walk) | ||
188 | { | ||
189 | struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
190 | unsigned int bsize = TF_BLOCK_SIZE; | ||
191 | unsigned int nbytes = walk->nbytes; | ||
192 | u128 *src = (u128 *)walk->src.virt.addr; | ||
193 | u128 *dst = (u128 *)walk->dst.virt.addr; | ||
194 | u128 ivs[3 - 1]; | ||
195 | u128 last_iv; | ||
196 | |||
197 | /* Start of the last block. */ | ||
198 | src += nbytes / bsize - 1; | ||
199 | dst += nbytes / bsize - 1; | ||
200 | |||
201 | last_iv = *src; | ||
202 | |||
203 | /* Process three block batch */ | ||
204 | if (nbytes >= bsize * 3) { | ||
205 | do { | ||
206 | nbytes -= bsize * (3 - 1); | ||
207 | src -= 3 - 1; | ||
208 | dst -= 3 - 1; | ||
209 | |||
210 | ivs[0] = src[0]; | ||
211 | ivs[1] = src[1]; | ||
212 | |||
213 | twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); | ||
214 | |||
215 | u128_xor(dst + 1, dst + 1, ivs + 0); | ||
216 | u128_xor(dst + 2, dst + 2, ivs + 1); | ||
217 | |||
218 | nbytes -= bsize; | ||
219 | if (nbytes < bsize) | ||
220 | goto done; | ||
221 | |||
222 | u128_xor(dst, dst, src - 1); | ||
223 | src -= 1; | ||
224 | dst -= 1; | ||
225 | } while (nbytes >= bsize * 3); | ||
226 | |||
227 | if (nbytes < bsize) | ||
228 | goto done; | ||
229 | } | ||
230 | |||
231 | /* Handle leftovers */ | ||
232 | for (;;) { | ||
233 | twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); | ||
234 | |||
235 | nbytes -= bsize; | ||
236 | if (nbytes < bsize) | ||
237 | break; | ||
238 | |||
239 | u128_xor(dst, dst, src - 1); | ||
240 | src -= 1; | ||
241 | dst -= 1; | ||
242 | } | ||
243 | |||
244 | done: | ||
245 | u128_xor(dst, dst, (u128 *)walk->iv); | ||
246 | *(u128 *)walk->iv = last_iv; | ||
247 | |||
248 | return nbytes; | ||
249 | } | ||
250 | |||
251 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
252 | struct scatterlist *src, unsigned int nbytes) | ||
253 | { | ||
254 | struct blkcipher_walk walk; | ||
255 | int err; | ||
256 | |||
257 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
258 | err = blkcipher_walk_virt(desc, &walk); | ||
259 | |||
260 | while ((nbytes = walk.nbytes)) { | ||
261 | nbytes = __cbc_decrypt(desc, &walk); | ||
262 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
263 | } | ||
264 | |||
265 | return err; | ||
266 | } | ||
267 | |||
268 | static struct crypto_alg blk_cbc_alg = { | ||
269 | .cra_name = "cbc(twofish)", | ||
270 | .cra_driver_name = "cbc-twofish-3way", | ||
271 | .cra_priority = 300, | ||
272 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
273 | .cra_blocksize = TF_BLOCK_SIZE, | ||
274 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
275 | .cra_alignmask = 0, | ||
276 | .cra_type = &crypto_blkcipher_type, | ||
277 | .cra_module = THIS_MODULE, | ||
278 | .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), | ||
279 | .cra_u = { | ||
280 | .blkcipher = { | ||
281 | .min_keysize = TF_MIN_KEY_SIZE, | ||
282 | .max_keysize = TF_MAX_KEY_SIZE, | ||
283 | .ivsize = TF_BLOCK_SIZE, | ||
284 | .setkey = twofish_setkey, | ||
285 | .encrypt = cbc_encrypt, | ||
286 | .decrypt = cbc_decrypt, | ||
287 | }, | ||
288 | }, | ||
289 | }; | ||
290 | |||
291 | static inline void u128_to_be128(be128 *dst, const u128 *src) | ||
292 | { | ||
293 | dst->a = cpu_to_be64(src->a); | ||
294 | dst->b = cpu_to_be64(src->b); | ||
295 | } | ||
296 | |||
297 | static inline void be128_to_u128(u128 *dst, const be128 *src) | ||
298 | { | ||
299 | dst->a = be64_to_cpu(src->a); | ||
300 | dst->b = be64_to_cpu(src->b); | ||
301 | } | ||
302 | |||
303 | static inline void u128_inc(u128 *i) | ||
304 | { | ||
305 | i->b++; | ||
306 | if (!i->b) | ||
307 | i->a++; | ||
308 | } | ||
309 | |||
310 | static void ctr_crypt_final(struct blkcipher_desc *desc, | ||
311 | struct blkcipher_walk *walk) | ||
312 | { | ||
313 | struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
314 | u8 *ctrblk = walk->iv; | ||
315 | u8 keystream[TF_BLOCK_SIZE]; | ||
316 | u8 *src = walk->src.virt.addr; | ||
317 | u8 *dst = walk->dst.virt.addr; | ||
318 | unsigned int nbytes = walk->nbytes; | ||
319 | |||
320 | twofish_enc_blk(ctx, keystream, ctrblk); | ||
321 | crypto_xor(keystream, src, nbytes); | ||
322 | memcpy(dst, keystream, nbytes); | ||
323 | |||
324 | crypto_inc(ctrblk, TF_BLOCK_SIZE); | ||
325 | } | ||
326 | |||
327 | static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | ||
328 | struct blkcipher_walk *walk) | ||
329 | { | ||
330 | struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | ||
331 | unsigned int bsize = TF_BLOCK_SIZE; | ||
332 | unsigned int nbytes = walk->nbytes; | ||
333 | u128 *src = (u128 *)walk->src.virt.addr; | ||
334 | u128 *dst = (u128 *)walk->dst.virt.addr; | ||
335 | u128 ctrblk; | ||
336 | be128 ctrblocks[3]; | ||
337 | |||
338 | be128_to_u128(&ctrblk, (be128 *)walk->iv); | ||
339 | |||
340 | /* Process three block batch */ | ||
341 | if (nbytes >= bsize * 3) { | ||
342 | do { | ||
343 | if (dst != src) { | ||
344 | dst[0] = src[0]; | ||
345 | dst[1] = src[1]; | ||
346 | dst[2] = src[2]; | ||
347 | } | ||
348 | |||
349 | /* create ctrblks for parallel encrypt */ | ||
350 | u128_to_be128(&ctrblocks[0], &ctrblk); | ||
351 | u128_inc(&ctrblk); | ||
352 | u128_to_be128(&ctrblocks[1], &ctrblk); | ||
353 | u128_inc(&ctrblk); | ||
354 | u128_to_be128(&ctrblocks[2], &ctrblk); | ||
355 | u128_inc(&ctrblk); | ||
356 | |||
357 | twofish_enc_blk_xor_3way(ctx, (u8 *)dst, | ||
358 | (u8 *)ctrblocks); | ||
359 | |||
360 | src += 3; | ||
361 | dst += 3; | ||
362 | nbytes -= bsize * 3; | ||
363 | } while (nbytes >= bsize * 3); | ||
364 | |||
365 | if (nbytes < bsize) | ||
366 | goto done; | ||
367 | } | ||
368 | |||
369 | /* Handle leftovers */ | ||
370 | do { | ||
371 | if (dst != src) | ||
372 | *dst = *src; | ||
373 | |||
374 | u128_to_be128(&ctrblocks[0], &ctrblk); | ||
375 | u128_inc(&ctrblk); | ||
376 | |||
377 | twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); | ||
378 | u128_xor(dst, dst, (u128 *)ctrblocks); | ||
379 | |||
380 | src += 1; | ||
381 | dst += 1; | ||
382 | nbytes -= bsize; | ||
383 | } while (nbytes >= bsize); | ||
384 | |||
385 | done: | ||
386 | u128_to_be128((be128 *)walk->iv, &ctrblk); | ||
387 | return nbytes; | ||
388 | } | ||
389 | |||
390 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | ||
391 | struct scatterlist *src, unsigned int nbytes) | ||
392 | { | ||
393 | struct blkcipher_walk walk; | ||
394 | int err; | ||
395 | |||
396 | blkcipher_walk_init(&walk, dst, src, nbytes); | ||
397 | err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); | ||
398 | |||
399 | while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { | ||
400 | nbytes = __ctr_crypt(desc, &walk); | ||
401 | err = blkcipher_walk_done(desc, &walk, nbytes); | ||
402 | } | ||
403 | |||
404 | if (walk.nbytes) { | ||
405 | ctr_crypt_final(desc, &walk); | ||
406 | err = blkcipher_walk_done(desc, &walk, 0); | ||
407 | } | ||
408 | |||
409 | return err; | ||
410 | } | ||
411 | |||
412 | static struct crypto_alg blk_ctr_alg = { | ||
413 | .cra_name = "ctr(twofish)", | ||
414 | .cra_driver_name = "ctr-twofish-3way", | ||
415 | .cra_priority = 300, | ||
416 | .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, | ||
417 | .cra_blocksize = TF_BLOCK_SIZE, | ||
418 | .cra_ctxsize = sizeof(struct twofish_ctx), | ||
419 | .cra_alignmask = 0, | ||
420 | .cra_type = &crypto_blkcipher_type, | ||
421 | .cra_module = THIS_MODULE, | ||
422 | .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), | ||
423 | .cra_u = { | ||
424 | .blkcipher = { | ||
425 | .min_keysize = TF_MIN_KEY_SIZE, | ||
426 | .max_keysize = TF_MAX_KEY_SIZE, | ||
427 | .ivsize = TF_BLOCK_SIZE, | ||
428 | .setkey = twofish_setkey, | ||
429 | .encrypt = ctr_crypt, | ||
430 | .decrypt = ctr_crypt, | ||
431 | }, | ||
432 | }, | ||
433 | }; | ||
434 | |||
435 | int __init init(void) | ||
436 | { | ||
437 | int err; | ||
438 | |||
439 | err = crypto_register_alg(&blk_ecb_alg); | ||
440 | if (err) | ||
441 | goto ecb_err; | ||
442 | err = crypto_register_alg(&blk_cbc_alg); | ||
443 | if (err) | ||
444 | goto cbc_err; | ||
445 | err = crypto_register_alg(&blk_ctr_alg); | ||
446 | if (err) | ||
447 | goto ctr_err; | ||
448 | |||
449 | return 0; | ||
450 | |||
451 | ctr_err: | ||
452 | crypto_unregister_alg(&blk_cbc_alg); | ||
453 | cbc_err: | ||
454 | crypto_unregister_alg(&blk_ecb_alg); | ||
455 | ecb_err: | ||
456 | return err; | ||
457 | } | ||
458 | |||
459 | void __exit fini(void) | ||
460 | { | ||
461 | crypto_unregister_alg(&blk_ctr_alg); | ||
462 | crypto_unregister_alg(&blk_cbc_alg); | ||
463 | crypto_unregister_alg(&blk_ecb_alg); | ||
464 | } | ||
465 | |||
466 | module_init(init); | ||
467 | module_exit(fini); | ||
468 | |||
469 | MODULE_LICENSE("GPL"); | ||
470 | MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); | ||
471 | MODULE_ALIAS("twofish"); | ||
472 | MODULE_ALIAS("twofish-asm"); | ||