aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto
diff options
context:
space:
mode:
author Jussi Kivilinna <jussi.kivilinna@mbnet.fi> 2011-09-23 12:50:55 -0400
committer Herbert Xu <herbert@gondor.apana.org.au> 2011-10-21 08:23:07 -0400
commit e827bb09c815955d5d5f0ddf98483a7efd04f55b (patch)
tree 6a4b262e8b51c5b863855549d5f6280b4f5c0e6c /arch/x86/crypto
parent fad8fa4782fde8afffc16b2b907b7f5bdbf03133 (diff)
crypto: blowfish-x86_64 - improve x86_64 blowfish 4-way performance
This patch adds an improved F-macro for the 4-way parallel functions. With the new F-macro in the 4-way parallel functions, blowfish sees a ~15% improvement in speed tests on AMD Phenom II (~5% on Intel Xeon E7330). However, when used in the 1-way blowfish function, the new macro would be ~10% slower than the original, so the old F-macro is kept for the 1-way functions. The patch cleans up the old F-macro, as it is no longer needed in the 4-way part. The patch also renames register macros to reduce stack usage. Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- arch/x86/crypto/blowfish-x86_64-asm_64.S 198
1 files changed, 98 insertions, 100 deletions
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 44eb23ab9676..391d245dc086 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -56,38 +56,32 @@
56 56
57#define RT0 %rbp 57#define RT0 %rbp
58#define RT1 %rsi 58#define RT1 %rsi
59#define RT2 %r8
60#define RT3 %r9
59 61
60#define RT0d %ebp 62#define RT0d %ebp
61#define RT1d %esi 63#define RT1d %esi
64#define RT2d %r8d
65#define RT3d %r9d
62 66
63#define RK0 %r8 67#define RKEY %r10
64#define RK1 %r9
65#define RK2 %r10
66#define RK3 %r11
67
68#define RK0d %r8d
69#define RK1d %r9d
70#define RK2d %r10d
71#define RK3d %r11d
72
73#define RKEY %r12
74 68
75/*********************************************************************** 69/***********************************************************************
76 * 1-way blowfish 70 * 1-way blowfish
77 ***********************************************************************/ 71 ***********************************************************************/
78#define F(x, k) \ 72#define F() \
79 rorq $16, x; \ 73 rorq $16, RX0; \
80 movzbl x ## bh, RT0d; \ 74 movzbl RX0bh, RT0d; \
81 movzbl x ## bl, RT1d; \ 75 movzbl RX0bl, RT1d; \
82 rolq $16, x; \ 76 rolq $16, RX0; \
83 movl s0(CTX,RT0,4), k ## d; \ 77 movl s0(CTX,RT0,4), RT0d; \
84 addl s1(CTX,RT1,4), k ## d; \ 78 addl s1(CTX,RT1,4), RT0d; \
85 movzbl x ## bh, RT0d; \ 79 movzbl RX0bh, RT1d; \
86 movzbl x ## bl, RT1d; \ 80 movzbl RX0bl, RT2d; \
87 rolq $32, x; \ 81 rolq $32, RX0; \
88 xorl s2(CTX,RT0,4), k ## d; \ 82 xorl s2(CTX,RT1,4), RT0d; \
89 addl s3(CTX,RT1,4), k ## d; \ 83 addl s3(CTX,RT2,4), RT0d; \
90 xorq k, x; 84 xorq RT0, RX0;
91 85
92#define add_roundkey_enc(n) \ 86#define add_roundkey_enc(n) \
93 xorq p+4*(n)(CTX), RX0; 87 xorq p+4*(n)(CTX), RX0;
@@ -95,11 +89,8 @@
95#define round_enc(n) \ 89#define round_enc(n) \
96 add_roundkey_enc(n); \ 90 add_roundkey_enc(n); \
97 \ 91 \
98 F(RX0, RK0); \ 92 F(); \
99 F(RX0, RK0); 93 F();
100
101#define round_final_enc(n) \
102 xorq p+4*(n)(CTX), RX0;
103 94
104#define add_roundkey_dec(n) \ 95#define add_roundkey_dec(n) \
105 movq p+4*(n-1)(CTX), RT0; \ 96 movq p+4*(n-1)(CTX), RT0; \
@@ -109,8 +100,8 @@
109#define round_dec(n) \ 100#define round_dec(n) \
110 add_roundkey_dec(n); \ 101 add_roundkey_dec(n); \
111 \ 102 \
112 F(RX0, RK0); \ 103 F(); \
113 F(RX0, RK0); \ 104 F(); \
114 105
115#define read_block() \ 106#define read_block() \
116 movq (RIO), RX0; \ 107 movq (RIO), RX0; \
@@ -130,16 +121,15 @@
130.type __blowfish_enc_blk,@function; 121.type __blowfish_enc_blk,@function;
131 122
132__blowfish_enc_blk: 123__blowfish_enc_blk:
133 // input: 124 /* input:
134 // %rdi: ctx, CTX 125 * %rdi: ctx, CTX
135 // %rsi: dst 126 * %rsi: dst
136 // %rdx: src 127 * %rdx: src
137 // %rcx: bool xor 128 * %rcx: bool, if true: xor output
138 pushq %rbp; 129 */
139 pushq %rbx; 130 movq %rbp, %r11;
140 131
141 pushq %rsi; 132 movq %rsi, %r10;
142 pushq %rcx;
143 movq %rdx, RIO; 133 movq %rdx, RIO;
144 134
145 read_block(); 135 read_block();
@@ -154,38 +144,31 @@ __blowfish_enc_blk:
154 round_enc(14); 144 round_enc(14);
155 add_roundkey_enc(16); 145 add_roundkey_enc(16);
156 146
157 popq %rbp; 147 movq %r11, %rbp;
158 popq RIO;
159 148
160 test %bpl, %bpl; 149 movq %r10, RIO;
150 test %cl, %cl;
161 jnz __enc_xor; 151 jnz __enc_xor;
162 152
163 write_block(); 153 write_block();
164
165__enc_ret:
166 popq %rbx;
167 popq %rbp;
168
169 ret; 154 ret;
170
171__enc_xor: 155__enc_xor:
172 xor_block(); 156 xor_block();
173 157 ret;
174 jmp __enc_ret;
175 158
176.align 8 159.align 8
177.global blowfish_dec_blk 160.global blowfish_dec_blk
178.type blowfish_dec_blk,@function; 161.type blowfish_dec_blk,@function;
179 162
180blowfish_dec_blk: 163blowfish_dec_blk:
181 // input: 164 /* input:
182 // %rdi: ctx, CTX 165 * %rdi: ctx, CTX
183 // %rsi: dst 166 * %rsi: dst
184 // %rdx: src 167 * %rdx: src
185 pushq %rbp; 168 */
186 pushq %rbx; 169 movq %rbp, %r11;
187 170
188 pushq %rsi; 171 movq %rsi, %r10;
189 movq %rdx, RIO; 172 movq %rdx, RIO;
190 173
191 read_block(); 174 read_block();
@@ -200,17 +183,33 @@ blowfish_dec_blk:
200 round_dec(3); 183 round_dec(3);
201 add_roundkey_dec(1); 184 add_roundkey_dec(1);
202 185
203 popq RIO; 186 movq %r10, RIO;
204 write_block(); 187 write_block();
205 188
206 popq %rbx; 189 movq %r11, %rbp;
207 popq %rbp;
208 190
209 ret; 191 ret;
210 192
211/********************************************************************** 193/**********************************************************************
212 4-way blowfish, four blocks parallel 194 4-way blowfish, four blocks parallel
213 **********************************************************************/ 195 **********************************************************************/
196
197/* F() for 4-way. Slower when used alone/1-way, but faster when used
198 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
199 */
200#define F4(x) \
201 movzbl x ## bh, RT1d; \
202 movzbl x ## bl, RT3d; \
203 rorq $16, x; \
204 movzbl x ## bh, RT0d; \
205 movzbl x ## bl, RT2d; \
206 rorq $16, x; \
207 movl s0(CTX,RT0,4), RT0d; \
208 addl s1(CTX,RT2,4), RT0d; \
209 xorl s2(CTX,RT1,4), RT0d; \
210 addl s3(CTX,RT3,4), RT0d; \
211 xorq RT0, x;
212
214#define add_preloaded_roundkey4() \ 213#define add_preloaded_roundkey4() \
215 xorq RKEY, RX0; \ 214 xorq RKEY, RX0; \
216 xorq RKEY, RX1; \ 215 xorq RKEY, RX1; \
@@ -227,15 +226,15 @@ blowfish_dec_blk:
227#define round_enc4(n) \ 226#define round_enc4(n) \
228 add_roundkey_enc4(n); \ 227 add_roundkey_enc4(n); \
229 \ 228 \
230 F(RX0, RK0); \ 229 F4(RX0); \
231 F(RX1, RK1); \ 230 F4(RX1); \
232 F(RX2, RK2); \ 231 F4(RX2); \
233 F(RX3, RK3); \ 232 F4(RX3); \
234 \ 233 \
235 F(RX0, RK0); \ 234 F4(RX0); \
236 F(RX1, RK1); \ 235 F4(RX1); \
237 F(RX2, RK2); \ 236 F4(RX2); \
238 F(RX3, RK3); 237 F4(RX3);
239 238
240#define preload_roundkey_dec(n) \ 239#define preload_roundkey_dec(n) \
241 movq p+4*((n)-1)(CTX), RKEY; \ 240 movq p+4*((n)-1)(CTX), RKEY; \
@@ -248,15 +247,15 @@ blowfish_dec_blk:
248#define round_dec4(n) \ 247#define round_dec4(n) \
249 add_roundkey_dec4(n); \ 248 add_roundkey_dec4(n); \
250 \ 249 \
251 F(RX0, RK0); \ 250 F4(RX0); \
252 F(RX1, RK1); \ 251 F4(RX1); \
253 F(RX2, RK2); \ 252 F4(RX2); \
254 F(RX3, RK3); \ 253 F4(RX3); \
255 \ 254 \
256 F(RX0, RK0); \ 255 F4(RX0); \
257 F(RX1, RK1); \ 256 F4(RX1); \
258 F(RX2, RK2); \ 257 F4(RX2); \
259 F(RX3, RK3); 258 F4(RX3);
260 259
261#define read_block4() \ 260#define read_block4() \
262 movq (RIO), RX0; \ 261 movq (RIO), RX0; \
@@ -306,18 +305,19 @@ blowfish_dec_blk:
306.type __blowfish_enc_blk_4way,@function; 305.type __blowfish_enc_blk_4way,@function;
307 306
308__blowfish_enc_blk_4way: 307__blowfish_enc_blk_4way:
309 // input: 308 /* input:
310 // %rdi: ctx, CTX 309 * %rdi: ctx, CTX
311 // %rsi: dst 310 * %rsi: dst
312 // %rdx: src 311 * %rdx: src
313 // %rcx: bool xor 312 * %rcx: bool, if true: xor output
313 */
314 pushq %rbp; 314 pushq %rbp;
315 pushq %rbx; 315 pushq %rbx;
316 pushq RKEY; 316 pushq %rcx;
317
317 preload_roundkey_enc(0); 318 preload_roundkey_enc(0);
318 319
319 pushq %rsi; 320 movq %rsi, %r11;
320 pushq %rcx;
321 movq %rdx, RIO; 321 movq %rdx, RIO;
322 322
323 read_block4(); 323 read_block4();
@@ -333,40 +333,39 @@ __blowfish_enc_blk_4way:
333 add_preloaded_roundkey4(); 333 add_preloaded_roundkey4();
334 334
335 popq %rbp; 335 popq %rbp;
336 popq RIO; 336 movq %r11, RIO;
337 337
338 test %bpl, %bpl; 338 test %bpl, %bpl;
339 jnz __enc_xor4; 339 jnz __enc_xor4;
340 340
341 write_block4(); 341 write_block4();
342 342
343__enc_ret4:
344 popq RKEY;
345 popq %rbx; 343 popq %rbx;
346 popq %rbp; 344 popq %rbp;
347
348 ret; 345 ret;
349 346
350__enc_xor4: 347__enc_xor4:
351 xor_block4(); 348 xor_block4();
352 349
353 jmp __enc_ret4; 350 popq %rbx;
351 popq %rbp;
352 ret;
354 353
355.align 8 354.align 8
356.global blowfish_dec_blk_4way 355.global blowfish_dec_blk_4way
357.type blowfish_dec_blk_4way,@function; 356.type blowfish_dec_blk_4way,@function;
358 357
359blowfish_dec_blk_4way: 358blowfish_dec_blk_4way:
360 // input: 359 /* input:
361 // %rdi: ctx, CTX 360 * %rdi: ctx, CTX
362 // %rsi: dst 361 * %rsi: dst
363 // %rdx: src 362 * %rdx: src
363 */
364 pushq %rbp; 364 pushq %rbp;
365 pushq %rbx; 365 pushq %rbx;
366 pushq RKEY;
367 preload_roundkey_dec(17); 366 preload_roundkey_dec(17);
368 367
369 pushq %rsi; 368 movq %rsi, %r11;
370 movq %rdx, RIO; 369 movq %rdx, RIO;
371 370
372 read_block4(); 371 read_block4();
@@ -381,10 +380,9 @@ blowfish_dec_blk_4way:
381 round_dec4(3); 380 round_dec4(3);
382 add_preloaded_roundkey4(); 381 add_preloaded_roundkey4();
383 382
384 popq RIO; 383 movq %r11, RIO;
385 write_block4(); 384 write_block4();
386 385
387 popq RKEY;
388 popq %rbx; 386 popq %rbx;
389 popq %rbp; 387 popq %rbp;
390 388