aboutsummaryrefslogtreecommitdiffstats
path: root/arch/i386/crypto/twofish-i586-asm.S
diff options
context:
space:
mode:
authorJoachim Fritschi <jfritschi@freenet.de>2006-06-20 06:59:16 -0400
committerHerbert Xu <herbert@gondor.apana.org.au>2006-09-20 21:16:28 -0400
commitb9f535ffe38f7eb61ac2219d32d97c377b69f70d (patch)
tree57e09481226ab5a25f3938963f8299c9f0cd8439 /arch/i386/crypto/twofish-i586-asm.S
parent758f570ea785a5fbcdca026dfab2e9e1a3f89726 (diff)
[CRYPTO] twofish: i586 assembly version
The patch passed the trycpt tests and automated filesystem tests. This rewrite resulted in some nice perfomance increase over my last patch. Short summary of the tcrypt benchmarks: Twofish Assembler vs. Twofish C (256bit 8kb block CBC) encrypt: -33% Cycles decrypt: -45% Cycles Twofish Assembler vs. AES Assembler (128bit 8kb block CBC) encrypt: +3% Cycles decrypt: -22% Cycles Twofish Assembler vs. AES Assembler (256bit 8kb block CBC) encrypt: -20% Cycles decrypt: -36% Cycles Full Output: http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-asm-i586.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-c-i586.txt http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-aes-asm-i586.txt Here is another bonnie++ benchmark with encrypted filesystems. All runs with the twofish assembler modules max out the drivespeed. It should give some idea what the module can do for encrypted filesystem performance even though you can't see the full numbers. http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060611_205432_x86.html Signed-off-by: Joachim Fritschi <jfritschi@freenet.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/i386/crypto/twofish-i586-asm.S')
-rw-r--r--arch/i386/crypto/twofish-i586-asm.S335
1 files changed, 335 insertions, 0 deletions
diff --git a/arch/i386/crypto/twofish-i586-asm.S b/arch/i386/crypto/twofish-i586-asm.S
new file mode 100644
index 000000000000..39b98ed2c1b9
--- /dev/null
+++ b/arch/i386/crypto/twofish-i586-asm.S
@@ -0,0 +1,335 @@
1/***************************************************************************
2* Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de> *
3* *
4* This program is free software; you can redistribute it and/or modify *
5* it under the terms of the GNU General Public License as published by *
6* the Free Software Foundation; either version 2 of the License, or *
7* (at your option) any later version. *
8* *
9* This program is distributed in the hope that it will be useful, *
10* but WITHOUT ANY WARRANTY; without even the implied warranty of *
11* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
12* GNU General Public License for more details. *
13* *
14* You should have received a copy of the GNU General Public License *
15* along with this program; if not, write to the *
16* Free Software Foundation, Inc., *
17* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
18***************************************************************************/
19
20.file "twofish-i586-asm.S"
21.text
22
23#include <asm/asm-offsets.h>
24
25/* return adress at 0 */
26
27#define in_blk 12 /* input byte array address parameter*/
28#define out_blk 8 /* output byte array address parameter*/
29#define tfm 4 /* Twofish context structure */
30
31#define a_offset 0
32#define b_offset 4
33#define c_offset 8
34#define d_offset 12
35
36/* Structure of the crypto context struct*/
37
38#define s0 0 /* S0 Array 256 Words each */
39#define s1 1024 /* S1 Array */
40#define s2 2048 /* S2 Array */
41#define s3 3072 /* S3 Array */
42#define w 4096 /* 8 whitening keys (word) */
43#define k 4128 /* key 1-32 ( word ) */
44
45/* define a few register aliases to allow macro substitution */
46
47#define R0D %eax
48#define R0B %al
49#define R0H %ah
50
51#define R1D %ebx
52#define R1B %bl
53#define R1H %bh
54
55#define R2D %ecx
56#define R2B %cl
57#define R2H %ch
58
59#define R3D %edx
60#define R3B %dl
61#define R3H %dh
62
63
64/* performs input whitening */
65#define input_whitening(src,context,offset)\
66 xor w+offset(context), src;
67
68/* performs input whitening */
69#define output_whitening(src,context,offset)\
70 xor w+16+offset(context), src;
71
72/*
73 * a input register containing a (rotated 16)
74 * b input register containing b
75 * c input register containing c
76 * d input register containing d (already rol $1)
77 * operations on a and b are interleaved to increase performance
78 */
79#define encrypt_round(a,b,c,d,round)\
80 push d ## D;\
81 movzx b ## B, %edi;\
82 mov s1(%ebp,%edi,4),d ## D;\
83 movzx a ## B, %edi;\
84 mov s2(%ebp,%edi,4),%esi;\
85 movzx b ## H, %edi;\
86 ror $16, b ## D;\
87 xor s2(%ebp,%edi,4),d ## D;\
88 movzx a ## H, %edi;\
89 ror $16, a ## D;\
90 xor s3(%ebp,%edi,4),%esi;\
91 movzx b ## B, %edi;\
92 xor s3(%ebp,%edi,4),d ## D;\
93 movzx a ## B, %edi;\
94 xor (%ebp,%edi,4), %esi;\
95 movzx b ## H, %edi;\
96 ror $15, b ## D;\
97 xor (%ebp,%edi,4), d ## D;\
98 movzx a ## H, %edi;\
99 xor s1(%ebp,%edi,4),%esi;\
100 pop %edi;\
101 add d ## D, %esi;\
102 add %esi, d ## D;\
103 add k+round(%ebp), %esi;\
104 xor %esi, c ## D;\
105 rol $15, c ## D;\
106 add k+4+round(%ebp),d ## D;\
107 xor %edi, d ## D;
108
109/*
110 * a input register containing a (rotated 16)
111 * b input register containing b
112 * c input register containing c
113 * d input register containing d (already rol $1)
114 * operations on a and b are interleaved to increase performance
115 * last round has different rotations for the output preparation
116 */
117#define encrypt_last_round(a,b,c,d,round)\
118 push d ## D;\
119 movzx b ## B, %edi;\
120 mov s1(%ebp,%edi,4),d ## D;\
121 movzx a ## B, %edi;\
122 mov s2(%ebp,%edi,4),%esi;\
123 movzx b ## H, %edi;\
124 ror $16, b ## D;\
125 xor s2(%ebp,%edi,4),d ## D;\
126 movzx a ## H, %edi;\
127 ror $16, a ## D;\
128 xor s3(%ebp,%edi,4),%esi;\
129 movzx b ## B, %edi;\
130 xor s3(%ebp,%edi,4),d ## D;\
131 movzx a ## B, %edi;\
132 xor (%ebp,%edi,4), %esi;\
133 movzx b ## H, %edi;\
134 ror $16, b ## D;\
135 xor (%ebp,%edi,4), d ## D;\
136 movzx a ## H, %edi;\
137 xor s1(%ebp,%edi,4),%esi;\
138 pop %edi;\
139 add d ## D, %esi;\
140 add %esi, d ## D;\
141 add k+round(%ebp), %esi;\
142 xor %esi, c ## D;\
143 ror $1, c ## D;\
144 add k+4+round(%ebp),d ## D;\
145 xor %edi, d ## D;
146
147/*
148 * a input register containing a
149 * b input register containing b (rotated 16)
150 * c input register containing c
151 * d input register containing d (already rol $1)
152 * operations on a and b are interleaved to increase performance
153 */
154#define decrypt_round(a,b,c,d,round)\
155 push c ## D;\
156 movzx a ## B, %edi;\
157 mov (%ebp,%edi,4), c ## D;\
158 movzx b ## B, %edi;\
159 mov s3(%ebp,%edi,4),%esi;\
160 movzx a ## H, %edi;\
161 ror $16, a ## D;\
162 xor s1(%ebp,%edi,4),c ## D;\
163 movzx b ## H, %edi;\
164 ror $16, b ## D;\
165 xor (%ebp,%edi,4), %esi;\
166 movzx a ## B, %edi;\
167 xor s2(%ebp,%edi,4),c ## D;\
168 movzx b ## B, %edi;\
169 xor s1(%ebp,%edi,4),%esi;\
170 movzx a ## H, %edi;\
171 ror $15, a ## D;\
172 xor s3(%ebp,%edi,4),c ## D;\
173 movzx b ## H, %edi;\
174 xor s2(%ebp,%edi,4),%esi;\
175 pop %edi;\
176 add %esi, c ## D;\
177 add c ## D, %esi;\
178 add k+round(%ebp), c ## D;\
179 xor %edi, c ## D;\
180 add k+4+round(%ebp),%esi;\
181 xor %esi, d ## D;\
182 rol $15, d ## D;
183
184/*
185 * a input register containing a
186 * b input register containing b (rotated 16)
187 * c input register containing c
188 * d input register containing d (already rol $1)
189 * operations on a and b are interleaved to increase performance
190 * last round has different rotations for the output preparation
191 */
192#define decrypt_last_round(a,b,c,d,round)\
193 push c ## D;\
194 movzx a ## B, %edi;\
195 mov (%ebp,%edi,4), c ## D;\
196 movzx b ## B, %edi;\
197 mov s3(%ebp,%edi,4),%esi;\
198 movzx a ## H, %edi;\
199 ror $16, a ## D;\
200 xor s1(%ebp,%edi,4),c ## D;\
201 movzx b ## H, %edi;\
202 ror $16, b ## D;\
203 xor (%ebp,%edi,4), %esi;\
204 movzx a ## B, %edi;\
205 xor s2(%ebp,%edi,4),c ## D;\
206 movzx b ## B, %edi;\
207 xor s1(%ebp,%edi,4),%esi;\
208 movzx a ## H, %edi;\
209 ror $16, a ## D;\
210 xor s3(%ebp,%edi,4),c ## D;\
211 movzx b ## H, %edi;\
212 xor s2(%ebp,%edi,4),%esi;\
213 pop %edi;\
214 add %esi, c ## D;\
215 add c ## D, %esi;\
216 add k+round(%ebp), c ## D;\
217 xor %edi, c ## D;\
218 add k+4+round(%ebp),%esi;\
219 xor %esi, d ## D;\
220 ror $1, d ## D;
221
222.align 4
223.global twofish_enc_blk
224.global twofish_dec_blk
225
226twofish_enc_blk:
227 push %ebp /* save registers according to calling convention*/
228 push %ebx
229 push %esi
230 push %edi
231
232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
233 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
234 mov in_blk+16(%esp),%edi /* input adress in edi */
235
236 mov (%edi), %eax
237 mov b_offset(%edi), %ebx
238 mov c_offset(%edi), %ecx
239 mov d_offset(%edi), %edx
240 input_whitening(%eax,%ebp,a_offset)
241 ror $16, %eax
242 input_whitening(%ebx,%ebp,b_offset)
243 input_whitening(%ecx,%ebp,c_offset)
244 input_whitening(%edx,%ebp,d_offset)
245 rol $1, %edx
246
247 encrypt_round(R0,R1,R2,R3,0);
248 encrypt_round(R2,R3,R0,R1,8);
249 encrypt_round(R0,R1,R2,R3,2*8);
250 encrypt_round(R2,R3,R0,R1,3*8);
251 encrypt_round(R0,R1,R2,R3,4*8);
252 encrypt_round(R2,R3,R0,R1,5*8);
253 encrypt_round(R0,R1,R2,R3,6*8);
254 encrypt_round(R2,R3,R0,R1,7*8);
255 encrypt_round(R0,R1,R2,R3,8*8);
256 encrypt_round(R2,R3,R0,R1,9*8);
257 encrypt_round(R0,R1,R2,R3,10*8);
258 encrypt_round(R2,R3,R0,R1,11*8);
259 encrypt_round(R0,R1,R2,R3,12*8);
260 encrypt_round(R2,R3,R0,R1,13*8);
261 encrypt_round(R0,R1,R2,R3,14*8);
262 encrypt_last_round(R2,R3,R0,R1,15*8);
263
264 output_whitening(%eax,%ebp,c_offset)
265 output_whitening(%ebx,%ebp,d_offset)
266 output_whitening(%ecx,%ebp,a_offset)
267 output_whitening(%edx,%ebp,b_offset)
268 mov out_blk+16(%esp),%edi;
269 mov %eax, c_offset(%edi)
270 mov %ebx, d_offset(%edi)
271 mov %ecx, (%edi)
272 mov %edx, b_offset(%edi)
273
274 pop %edi
275 pop %esi
276 pop %ebx
277 pop %ebp
278 mov $1, %eax
279 ret
280
281twofish_dec_blk:
282 push %ebp /* save registers according to calling convention*/
283 push %ebx
284 push %esi
285 push %edi
286
287
288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
289 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */
290 mov in_blk+16(%esp),%edi /* input adress in edi */
291
292 mov (%edi), %eax
293 mov b_offset(%edi), %ebx
294 mov c_offset(%edi), %ecx
295 mov d_offset(%edi), %edx
296 output_whitening(%eax,%ebp,a_offset)
297 output_whitening(%ebx,%ebp,b_offset)
298 ror $16, %ebx
299 output_whitening(%ecx,%ebp,c_offset)
300 output_whitening(%edx,%ebp,d_offset)
301 rol $1, %ecx
302
303 decrypt_round(R0,R1,R2,R3,15*8);
304 decrypt_round(R2,R3,R0,R1,14*8);
305 decrypt_round(R0,R1,R2,R3,13*8);
306 decrypt_round(R2,R3,R0,R1,12*8);
307 decrypt_round(R0,R1,R2,R3,11*8);
308 decrypt_round(R2,R3,R0,R1,10*8);
309 decrypt_round(R0,R1,R2,R3,9*8);
310 decrypt_round(R2,R3,R0,R1,8*8);
311 decrypt_round(R0,R1,R2,R3,7*8);
312 decrypt_round(R2,R3,R0,R1,6*8);
313 decrypt_round(R0,R1,R2,R3,5*8);
314 decrypt_round(R2,R3,R0,R1,4*8);
315 decrypt_round(R0,R1,R2,R3,3*8);
316 decrypt_round(R2,R3,R0,R1,2*8);
317 decrypt_round(R0,R1,R2,R3,1*8);
318 decrypt_last_round(R2,R3,R0,R1,0);
319
320 input_whitening(%eax,%ebp,c_offset)
321 input_whitening(%ebx,%ebp,d_offset)
322 input_whitening(%ecx,%ebp,a_offset)
323 input_whitening(%edx,%ebp,b_offset)
324 mov out_blk+16(%esp),%edi;
325 mov %eax, c_offset(%edi)
326 mov %ebx, d_offset(%edi)
327 mov %ecx, (%edi)
328 mov %edx, b_offset(%edi)
329
330 pop %edi
331 pop %esi
332 pop %ebx
333 pop %ebp
334 mov $1, %eax
335 ret