diff options
author | Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 2011-09-26 09:47:25 -0400 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2011-10-21 08:23:08 -0400 |
commit | 8280daad436edb7dd9e7e06fc13bcecb6b2a885c (patch) | |
tree | 0d4cb032c6da8617bd4a2dd84bd8ef1a605fa19d /arch/x86/crypto/twofish-x86_64-asm_64-3way.S | |
parent | 91d41f159d75d602f6001218eec64c5e761475a6 (diff) |
crypto: twofish - add 3-way parallel x86_64 assembler implementation
Patch adds a 3-way parallel x86_64 assembly implementation of twofish as a new
module. The new assembler functions process data in three-block chunks, improving
cipher performance on out-of-order CPUs.
Patch has been tested with tcrypt and automated filesystem tests.
Summary of the tcrypt benchmarks:
Twofish 3-way-asm vs twofish asm (128bit 8kb block ECB)
encrypt: 1.3x speed
decrypt: 1.3x speed
Twofish 3-way-asm vs twofish asm (128bit 8kb block CBC)
encrypt: 1.07x speed
decrypt: 1.4x speed
Twofish 3-way-asm vs twofish asm (128bit 8kb block CTR)
encrypt: 1.4x speed
Twofish 3-way-asm vs AES asm (128bit 8kb block ECB)
encrypt: 1.0x speed
decrypt: 1.0x speed
Twofish 3-way-asm vs AES asm (128bit 8kb block CBC)
encrypt: 0.84x speed
decrypt: 1.09x speed
Twofish 3-way-asm vs AES asm (128bit 8kb block CTR)
encrypt: 1.15x speed
Full output:
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-3way-asm-x86_64.txt
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-twofish-asm-x86_64.txt
http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-aes-asm-x86_64.txt
Tests were run on:
vendor_id : AuthenticAMD
cpu family : 16
model : 10
model name : AMD Phenom(tm) II X6 1055T Processor
Userspace tests were also run on:
vendor_id : GenuineIntel
cpu family : 6
model : 15
model name : Intel(R) Xeon(R) CPU E7330 @ 2.40GHz
stepping : 11
Userspace test results:
Encryption/decryption of twofish 3-way vs x86_64-asm on AMD Phenom II:
encrypt: 1.27x
decrypt: 1.25x
Encryption/decryption of twofish 3-way vs x86_64-asm on Intel Xeon E7330:
encrypt: 1.36x
decrypt: 1.36x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/twofish-x86_64-asm_64-3way.S')
-rw-r--r-- | arch/x86/crypto/twofish-x86_64-asm_64-3way.S | 316 |
1 files changed, 316 insertions, 0 deletions
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S new file mode 100644 index 00000000000..5b012a2c511 --- /dev/null +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S | |||
@@ -0,0 +1,316 @@ | |||
1 | /* | ||
2 | * Twofish Cipher 3-way parallel algorithm (x86_64) | ||
3 | * | ||
4 | * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | ||
19 | * USA | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | .file "twofish-x86_64-asm-3way.S" | ||
24 | .text | ||
25 | |||
26 | /* structure of crypto context: byte offsets added to CTX by the macros below */ | ||
27 | #define s0 0 /* lookup table 0; addressed as s0(CTX, byte, 4), i.e. 256 dword entries */ | ||
28 | #define s1 1024 /* lookup table 1, 1024 bytes after s0 */ | ||
29 | #define s2 2048 /* lookup table 2 */ | ||
30 | #define s3 3072 /* lookup table 3 */ | ||
31 | #define w 4096 /* whitening words; this file reads w[0..7] (32 bytes, ending where k starts) */ | ||
32 | #define k 4128 /* round subkeys; round n adds the dwords k[2n] and k[2n+1] */ | ||
33 | |||
34 | /********************************************************************** | ||
35 | 3-way twofish | ||
36 | **********************************************************************/ | ||
37 | #define CTX %rdi /* context pointer (first argument); base for the s0..s3, w and k offsets */ | ||
38 | #define RIO %rdx /* I/O pointer: src on entry, reloaded with dst before the output stage */ | ||
39 | /* RAB0..2: one packed pair of 32-bit words per block, for blocks 0..2. These are rax/rbx/rcx because the byte forms %ah/%bh/%ch below are needed. */ | ||
40 | #define RAB0 %rax | ||
41 | #define RAB1 %rbx | ||
42 | #define RAB2 %rcx | ||
43 | |||
44 | #define RAB0d %eax | ||
45 | #define RAB1d %ebx | ||
46 | #define RAB2d %ecx | ||
47 | |||
48 | #define RAB0bh %ah | ||
49 | #define RAB1bh %bh | ||
50 | #define RAB2bh %ch | ||
51 | |||
52 | #define RAB0bl %al | ||
53 | #define RAB1bl %bl | ||
54 | #define RAB2bl %cl | ||
55 | /* RCD0..2: the other packed word pair of each block; only 64-bit and 32-bit forms are defined for these. */ | ||
56 | #define RCD0 %r8 | ||
57 | #define RCD1 %r9 | ||
58 | #define RCD2 %r10 | ||
59 | |||
60 | #define RCD0d %r8d | ||
61 | #define RCD1d %r9d | ||
62 | #define RCD2d %r10d | ||
63 | /* RX0..2 / RY0..2: per-block 32-bit accumulators for the two table-lookup results built by g1g2_3. */ | ||
64 | #define RX0 %rbp | ||
65 | #define RX1 %r11 | ||
66 | #define RX2 %r12 | ||
67 | |||
68 | #define RX0d %ebp | ||
69 | #define RX1d %r11d | ||
70 | #define RX2d %r12d | ||
71 | |||
72 | #define RY0 %r13 | ||
73 | #define RY1 %r14 | ||
74 | #define RY2 %r15 | ||
75 | |||
76 | #define RY0d %r13d | ||
77 | #define RY1d %r14d | ||
78 | #define RY2d %r15d | ||
79 | /* RT0/RT1: scratch table indices. RT0 is the same register as RIO and RT1 is the dst argument register, so both pointers are overwritten during the rounds; the entry points push dst and pop it into RIO afterwards. */ | ||
80 | #define RT0 %rdx | ||
81 | #define RT1 %rsi | ||
82 | |||
83 | #define RT0d %edx | ||
84 | #define RT1d %esi | ||
85 | /* do16bit_ror: look up the two low bytes of ab in tables T0 and T1, combine the entries into dst with op1/op2 (mov or xor), and rotate ab right by rot bits. tmp1/tmp2 are clobbered; tmp2 may be the same register as dst when op1 is mov. */ | ||
86 | #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ | ||
87 | movzbl ab ## bl, tmp2 ## d; /* tmp2 = bits 0-7 of ab, zero-extended */ \ | ||
88 | movzbl ab ## bh, tmp1 ## d; /* tmp1 = bits 8-15 of ab, zero-extended */ \ | ||
89 | rorq $(rot), ab; /* expose the next 16 bits to be consumed in the low word */ \ | ||
90 | op1##l T0(CTX, tmp2, 4), dst ## d; /* dst (op1) dword at CTX + T0 + 4*tmp2 */ \ | ||
91 | op2##l T1(CTX, tmp1, 4), dst ## d; /* dst (op2) dword at CTX + T1 + 4*tmp1 */ | ||
92 | |||
93 | /* | ||
94 | * Combined G1 & G2 function. Reordered with help of rotates to have moves | ||
95 | * at beginning. For each of the three blocks, x is built from the four bytes of the low 32-bit word of ab and y from the four bytes of the high word. The rotates add up to 32+48+32+16 = 128 bits, so ab holds its original value again when the final xchgq swaps it with cd. RT0 and RT1 are clobbered. | ||
96 | */ | ||
97 | #define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ | ||
98 | /* G1,1 && G2,1: first two bytes of each word; mov initializes x and y */ \ | ||
99 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); /* x = Tx0[lo byte 0] ^ Tx1[lo byte 1] */ \ | ||
100 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); /* y = Ty1[hi byte 0] ^ Ty2[hi byte 1] */ \ | ||
101 | \ | ||
102 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ | ||
103 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ | ||
104 | \ | ||
105 | do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ | ||
106 | do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ | ||
107 | \ | ||
108 | /* G1,2 && G2,2: last two bytes of each word; x and y are live, so RT1 is the second index register */ \ | ||
109 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); /* x ^= Tx2[lo byte 2] ^ Tx3[lo byte 3] */ \ | ||
110 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); /* y ^= Ty3[hi byte 2] ^ Ty0[hi byte 3] */ \ | ||
111 | xchgq cd ## 0, ab ## 0; /* swap the two word pairs of block 0 */ \ | ||
112 | \ | ||
113 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ | ||
114 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ | ||
115 | xchgq cd ## 1, ab ## 1; \ | ||
116 | \ | ||
117 | do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ | ||
118 | do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ | ||
119 | xchgq cd ## 2, ab ## 2; | ||
120 | /* enc_round_end: finish encryption round n for one block. In: x, y = the two lookup results, ab = word pair (lo, hi) to update. Out: lo = ror32((x + y + k[2n]) ^ lo, 1), hi = rol32(hi, 1) ^ (x + 2y + k[2n+1]). x and y are clobbered. */ | ||
121 | #define enc_round_end(ab, x, y, n) \ | ||
122 | addl y ## d, x ## d; /* x = x + y */ \ | ||
123 | addl x ## d, y ## d; /* y = x + 2y in terms of the inputs */ \ | ||
124 | addl k+4*(2*(n))(CTX), x ## d; /* x += k[2n] */ \ | ||
125 | xorl ab ## d, x ## d; /* x ^= low word of ab */ \ | ||
126 | addl k+4*(2*(n)+1)(CTX), y ## d; /* y += k[2n+1] */ \ | ||
127 | shrq $32, ab; /* bring the high word down; the old low word now lives in x */ \ | ||
128 | roll $1, ab ## d; /* rotate the high word left by one bit */ \ | ||
129 | xorl y ## d, ab ## d; /* ... and xor in y */ \ | ||
130 | shlq $32, ab; /* move it back up; the low half is now zero */ \ | ||
131 | rorl $1, x ## d; /* new low word = ror32(x, 1) */ \ | ||
132 | orq x, ab; /* merge; the upper half of x is zero after the 32-bit operations */ | ||
133 | /* dec_round_end: finish decryption round n for one block. In: x, y = the two lookup results, ba = word pair (lo, hi) to update. Out: lo = ror32((x + 2y + k[2n+1]) ^ lo, 1), hi = rol32(hi, 1) ^ (x + y + k[2n]). x and y are clobbered. */ | ||
134 | #define dec_round_end(ba, x, y, n) \ | ||
135 | addl y ## d, x ## d; /* x = x + y */ \ | ||
136 | addl x ## d, y ## d; /* y = x + 2y in terms of the inputs */ \ | ||
137 | addl k+4*(2*(n))(CTX), x ## d; /* x += k[2n] */ \ | ||
138 | addl k+4*(2*(n)+1)(CTX), y ## d; /* y += k[2n+1] */ \ | ||
139 | xorl ba ## d, y ## d; /* y ^= low word of ba */ \ | ||
140 | shrq $32, ba; /* bring the high word down; the old low word now lives in y */ \ | ||
141 | roll $1, ba ## d; /* rotate the high word left by one bit */ \ | ||
142 | xorl x ## d, ba ## d; /* ... and xor in x */ \ | ||
143 | shlq $32, ba; /* move it back up; the low half is now zero */ \ | ||
144 | rorl $1, y ## d; /* new low word = ror32(y, 1) */ \ | ||
145 | orq y, ba; /* merge; the upper half of y is zero after the 32-bit operations */ | ||
146 | /* encrypt_round3: encryption round n on all three blocks. g1g2_3 ends by exchanging ab and cd, so the enc_round_end calls below update the word pairs that were in cd on entry. */ | ||
147 | #define encrypt_round3(ab, cd, n) \ | ||
148 | g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); /* RX from the low word, RY from the high word */ \ | ||
149 | \ | ||
150 | enc_round_end(ab ## 0, RX0, RY0, n); \ | ||
151 | enc_round_end(ab ## 1, RX1, RY1, n); \ | ||
152 | enc_round_end(ab ## 2, RX2, RY2, n); | ||
153 | /* decrypt_round3: decryption round n on all three blocks. Compared with encrypt_round3 the table lists are rotated and the accumulator roles are swapped: the lookup on the low word lands in RY and the lookup on the high word in RX. g1g2_3 ends by exchanging ba and dc, so dec_round_end updates the pairs that were in dc on entry. */ | ||
154 | #define decrypt_round3(ba, dc, n) \ | ||
155 | g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ | ||
156 | \ | ||
157 | dec_round_end(ba ## 0, RX0, RY0, n); \ | ||
158 | dec_round_end(ba ## 1, RX1, RY1, n); \ | ||
159 | dec_round_end(ba ## 2, RX2, RY2, n); | ||
160 | /* encrypt_cycle3: encryption rounds 2n and 2n+1, in that order, on all three blocks. n is parenthesized so that an expression argument still expands to the intended round numbers. */ | ||
161 | #define encrypt_cycle3(ab, cd, n) \ | ||
162 | encrypt_round3(ab, cd, (n)*2); \ | ||
163 | encrypt_round3(ab, cd, ((n)*2)+1); | ||
164 | /* decrypt_cycle3: decryption rounds 2n+1 and then 2n on all three blocks (reverse of the encryption order). n is parenthesized so that an expression argument still expands to the intended round numbers. */ | ||
165 | #define decrypt_cycle3(ba, dc, n) \ | ||
166 | decrypt_round3(ba, dc, ((n)*2)+1); \ | ||
167 | decrypt_round3(ba, dc, ((n)*2)); | ||
168 | /* inpack3: load the 8-byte word pair at byte offset 4*n of three consecutive 16-byte blocks at in into xy0..xy2, XORing each with the 8 bytes at w+4*m in the context. m is parenthesized, like n, so that expression arguments expand correctly. */ | ||
169 | #define inpack3(in, n, xy, m) \ | ||
170 | movq 4*(n)(in), xy ## 0; /* block 0 */ \ | ||
171 | xorq w+4*(m)(CTX), xy ## 0; \ | ||
172 | \ | ||
173 | movq 4*(4+(n))(in), xy ## 1; /* block 1: same words, 16 bytes further on */ \ | ||
174 | xorq w+4*(m)(CTX), xy ## 1; \ | ||
175 | \ | ||
176 | movq 4*(8+(n))(in), xy ## 2; /* block 2: 32 bytes further on */ \ | ||
177 | xorq w+4*(m)(CTX), xy ## 2; | ||
178 | /* outunpack3: XOR xy0..xy2 with the 8 bytes at w+4*m in the context, then apply op (mov = store, xor = XOR into memory) to the 8 bytes at byte offset 4*n of three consecutive 16-byte blocks at out. xy0..xy2 are modified. m is parenthesized, like n, so that expression arguments expand correctly. */ | ||
179 | #define outunpack3(op, out, n, xy, m) \ | ||
180 | xorq w+4*(m)(CTX), xy ## 0; \ | ||
181 | op ## q xy ## 0, 4*(n)(out); /* block 0 */ \ | ||
182 | \ | ||
183 | xorq w+4*(m)(CTX), xy ## 1; \ | ||
184 | op ## q xy ## 1, 4*(4+(n))(out); /* block 1: 16 bytes further on */ \ | ||
185 | \ | ||
186 | xorq w+4*(m)(CTX), xy ## 2; \ | ||
187 | op ## q xy ## 2, 4*(8+(n))(out); /* block 2: 32 bytes further on */ | ||
188 | /* inpack_enc3: load three blocks from RIO for encryption, applying the whitening words w[0..3]. */ | ||
189 | #define inpack_enc3() \ | ||
190 | inpack3(RIO, 0, RAB, 0); /* RAB = words 0-1 ^ w[0..1] */ \ | ||
191 | inpack3(RIO, 2, RCD, 2); /* RCD = words 2-3 ^ w[2..3] */ | ||
192 | /* outunpack_enc3: write three encrypted blocks to RIO with op (mov or xor), applying the whitening words w[4..7]. The register pairs are written crosswise: RAB goes to words 2-3 and RCD to words 0-1. */ | ||
193 | #define outunpack_enc3(op) \ | ||
194 | outunpack3(op, RIO, 2, RAB, 6); /* words 2-3 (op) RAB ^ w[6..7] */ \ | ||
195 | outunpack3(op, RIO, 0, RCD, 4); /* words 0-1 (op) RCD ^ w[4..5] */ | ||
196 | /* inpack_dec3: load three blocks from RIO for decryption, applying the whitening words w[4..7], then swap the two 32-bit halves of every register pair. */ | ||
197 | #define inpack_dec3() \ | ||
198 | inpack3(RIO, 0, RAB, 4); /* RAB = words 0-1 ^ w[4..5] */ \ | ||
199 | rorq $32, RAB0; /* rotating by 32 exchanges the low and high words */ \ | ||
200 | rorq $32, RAB1; \ | ||
201 | rorq $32, RAB2; \ | ||
202 | inpack3(RIO, 2, RCD, 6); /* RCD = words 2-3 ^ w[6..7] */ \ | ||
203 | rorq $32, RCD0; \ | ||
204 | rorq $32, RCD1; \ | ||
205 | rorq $32, RCD2; | ||
206 | /* outunpack_dec3: swap the 32-bit halves of every register pair back, apply the whitening words w[0..3] and store three decrypted blocks to RIO. The register pairs are written crosswise: RCD goes to words 0-1 and RAB to words 2-3. */ | ||
207 | #define outunpack_dec3() \ | ||
208 | rorq $32, RCD0; /* rotating by 32 exchanges the low and high words */ \ | ||
209 | rorq $32, RCD1; \ | ||
210 | rorq $32, RCD2; \ | ||
211 | outunpack3(mov, RIO, 0, RCD, 0); /* words 0-1 = RCD ^ w[0..1] */ \ | ||
212 | rorq $32, RAB0; \ | ||
213 | rorq $32, RAB1; \ | ||
214 | rorq $32, RAB2; \ | ||
215 | outunpack3(mov, RIO, 2, RAB, 2); /* words 2-3 = RAB ^ w[2..3] */ | ||
216 | |||
217 | .align 8 | ||
218 | .global __twofish_enc_blk_3way | ||
219 | .type __twofish_enc_blk_3way,@function; | ||
220 | /* Encrypt three consecutive 16-byte blocks (48 bytes) from src. The result is stored to dst, or XORed into the bytes already at dst when the bool argument is non-zero. */ | ||
221 | __twofish_enc_blk_3way: | ||
222 | /* input: | ||
223 | * %rdi: ctx, CTX | ||
224 | * %rsi: dst | ||
225 | * %rdx: src, RIO | ||
226 | * %rcx: bool, if true: xor output | ||
227 | */ | ||
228 | pushq %r15; /* save the callee-saved registers used as RX/RY/RAB1 */ | ||
229 | pushq %r14; | ||
230 | pushq %r13; | ||
231 | pushq %r12; | ||
232 | pushq %rbp; | ||
233 | pushq %rbx; | ||
234 | |||
235 | pushq %rcx; /* bool xor: %rcx is reused as RAB2 */ | ||
236 | pushq %rsi; /* dst: %rsi is reused as RT1 */ | ||
237 | |||
238 | inpack_enc3(); /* last use of src; %rdx becomes RT0 during the rounds */ | ||
239 | |||
240 | encrypt_cycle3(RAB, RCD, 0); /* 8 cycles = rounds 0..15 */ | ||
241 | encrypt_cycle3(RAB, RCD, 1); | ||
242 | encrypt_cycle3(RAB, RCD, 2); | ||
243 | encrypt_cycle3(RAB, RCD, 3); | ||
244 | encrypt_cycle3(RAB, RCD, 4); | ||
245 | encrypt_cycle3(RAB, RCD, 5); | ||
246 | encrypt_cycle3(RAB, RCD, 6); | ||
247 | encrypt_cycle3(RAB, RCD, 7); | ||
248 | |||
249 | popq RIO; /* dst */ | ||
250 | popq %rbp; /* bool xor; RX0 is no longer needed, the caller's %rbp is still on the stack */ | ||
251 | |||
252 | testb %bpl, %bpl; /* only the low byte of the bool is examined */ | ||
253 | jnz .L__enc_xor3; /* assembler-local label: kept out of the symbol table */ | ||
254 | |||
255 | outunpack_enc3(mov); /* plain store to dst */ | ||
256 | |||
257 | popq %rbx; | ||
258 | popq %rbp; | ||
259 | popq %r12; | ||
260 | popq %r13; | ||
261 | popq %r14; | ||
262 | popq %r15; | ||
263 | ret; | ||
264 | |||
265 | .L__enc_xor3: | ||
266 | outunpack_enc3(xor); /* XOR the result into the bytes already at dst */ | ||
267 | |||
268 | popq %rbx; | ||
269 | popq %rbp; | ||
270 | popq %r12; | ||
271 | popq %r13; | ||
272 | popq %r14; | ||
273 | popq %r15; | ||
274 | ret; | ||
275 | |||
276 | .global twofish_dec_blk_3way | ||
277 | .type twofish_dec_blk_3way,@function; | ||
278 | /* Decrypt three consecutive 16-byte blocks (48 bytes) from src and store the result to dst. */ | ||
279 | twofish_dec_blk_3way: | ||
280 | /* input: | ||
281 | * %rdi: ctx, CTX | ||
282 | * %rsi: dst | ||
283 | * %rdx: src, RIO | ||
284 | */ | ||
285 | pushq %r15; /* save the callee-saved registers used as RX/RY/RAB1 */ | ||
286 | pushq %r14; | ||
287 | pushq %r13; | ||
288 | pushq %r12; | ||
289 | pushq %rbp; | ||
290 | pushq %rbx; | ||
291 | |||
292 | pushq %rsi; /* dst: %rsi is reused as RT1 */ | ||
293 | |||
294 | inpack_dec3(); /* last use of src; %rdx becomes RT0 during the rounds */ | ||
295 | |||
296 | decrypt_cycle3(RAB, RCD, 7); /* 8 cycles = rounds 15 down to 0 */ | ||
297 | decrypt_cycle3(RAB, RCD, 6); | ||
298 | decrypt_cycle3(RAB, RCD, 5); | ||
299 | decrypt_cycle3(RAB, RCD, 4); | ||
300 | decrypt_cycle3(RAB, RCD, 3); | ||
301 | decrypt_cycle3(RAB, RCD, 2); | ||
302 | decrypt_cycle3(RAB, RCD, 1); | ||
303 | decrypt_cycle3(RAB, RCD, 0); | ||
304 | |||
305 | popq RIO; /* dst */ | ||
306 | |||
307 | outunpack_dec3(); /* always a plain store; there is no xor variant for decryption */ | ||
308 | |||
309 | popq %rbx; /* restore in reverse push order */ | ||
310 | popq %rbp; | ||
311 | popq %r12; | ||
312 | popq %r13; | ||
313 | popq %r14; | ||
314 | popq %r15; | ||
315 | ret; | ||
316 | |||