diff options
author | Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | 2011-09-23 12:50:55 -0400 |
---|---|---|
committer | Herbert Xu <herbert@gondor.apana.org.au> | 2011-10-21 08:23:07 -0400 |
commit | e827bb09c815955d5d5f0ddf98483a7efd04f55b (patch) | |
tree | 6a4b262e8b51c5b863855549d5f6280b4f5c0e6c /arch/x86/crypto | |
parent | fad8fa4782fde8afffc16b2b907b7f5bdbf03133 (diff) |
crypto: blowfish-x86_64 - improve x86_64 blowfish 4-way performance
This patch adds an improved F-macro for the 4-way parallel functions. With the
new F-macro, the 4-way blowfish functions see a ~15% improvement in
speed tests on AMD Phenom II (~5% on Intel Xeon E7330).
However, when used in the 1-way blowfish function, the new macro would be ~10%
slower than the original, so the old F-macro is kept for the 1-way functions.
The patch cleans up the old F-macro, as it is no longer needed in the 4-way part.
The patch also renames the register macros to reduce stack usage.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r-- | arch/x86/crypto/blowfish-x86_64-asm_64.S | 198 |
1 files changed, 98 insertions, 100 deletions
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S index 44eb23ab9676..391d245dc086 100644 --- a/arch/x86/crypto/blowfish-x86_64-asm_64.S +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S | |||
@@ -56,38 +56,32 @@ | |||
56 | 56 | ||
57 | #define RT0 %rbp | 57 | #define RT0 %rbp |
58 | #define RT1 %rsi | 58 | #define RT1 %rsi |
59 | #define RT2 %r8 | ||
60 | #define RT3 %r9 | ||
59 | 61 | ||
60 | #define RT0d %ebp | 62 | #define RT0d %ebp |
61 | #define RT1d %esi | 63 | #define RT1d %esi |
64 | #define RT2d %r8d | ||
65 | #define RT3d %r9d | ||
62 | 66 | ||
63 | #define RK0 %r8 | 67 | #define RKEY %r10 |
64 | #define RK1 %r9 | ||
65 | #define RK2 %r10 | ||
66 | #define RK3 %r11 | ||
67 | |||
68 | #define RK0d %r8d | ||
69 | #define RK1d %r9d | ||
70 | #define RK2d %r10d | ||
71 | #define RK3d %r11d | ||
72 | |||
73 | #define RKEY %r12 | ||
74 | 68 | ||
75 | /*********************************************************************** | 69 | /*********************************************************************** |
76 | * 1-way blowfish | 70 | * 1-way blowfish |
77 | ***********************************************************************/ | 71 | ***********************************************************************/ |
78 | #define F(x, k) \ | 72 | #define F() \ |
79 | rorq $16, x; \ | 73 | rorq $16, RX0; \ |
80 | movzbl x ## bh, RT0d; \ | 74 | movzbl RX0bh, RT0d; \ |
81 | movzbl x ## bl, RT1d; \ | 75 | movzbl RX0bl, RT1d; \ |
82 | rolq $16, x; \ | 76 | rolq $16, RX0; \ |
83 | movl s0(CTX,RT0,4), k ## d; \ | 77 | movl s0(CTX,RT0,4), RT0d; \ |
84 | addl s1(CTX,RT1,4), k ## d; \ | 78 | addl s1(CTX,RT1,4), RT0d; \ |
85 | movzbl x ## bh, RT0d; \ | 79 | movzbl RX0bh, RT1d; \ |
86 | movzbl x ## bl, RT1d; \ | 80 | movzbl RX0bl, RT2d; \ |
87 | rolq $32, x; \ | 81 | rolq $32, RX0; \ |
88 | xorl s2(CTX,RT0,4), k ## d; \ | 82 | xorl s2(CTX,RT1,4), RT0d; \ |
89 | addl s3(CTX,RT1,4), k ## d; \ | 83 | addl s3(CTX,RT2,4), RT0d; \ |
90 | xorq k, x; | 84 | xorq RT0, RX0; |
91 | 85 | ||
92 | #define add_roundkey_enc(n) \ | 86 | #define add_roundkey_enc(n) \ |
93 | xorq p+4*(n)(CTX), RX0; | 87 | xorq p+4*(n)(CTX), RX0; |
@@ -95,11 +89,8 @@ | |||
95 | #define round_enc(n) \ | 89 | #define round_enc(n) \ |
96 | add_roundkey_enc(n); \ | 90 | add_roundkey_enc(n); \ |
97 | \ | 91 | \ |
98 | F(RX0, RK0); \ | 92 | F(); \ |
99 | F(RX0, RK0); | 93 | F(); |
100 | |||
101 | #define round_final_enc(n) \ | ||
102 | xorq p+4*(n)(CTX), RX0; | ||
103 | 94 | ||
104 | #define add_roundkey_dec(n) \ | 95 | #define add_roundkey_dec(n) \ |
105 | movq p+4*(n-1)(CTX), RT0; \ | 96 | movq p+4*(n-1)(CTX), RT0; \ |
@@ -109,8 +100,8 @@ | |||
109 | #define round_dec(n) \ | 100 | #define round_dec(n) \ |
110 | add_roundkey_dec(n); \ | 101 | add_roundkey_dec(n); \ |
111 | \ | 102 | \ |
112 | F(RX0, RK0); \ | 103 | F(); \ |
113 | F(RX0, RK0); \ | 104 | F(); \ |
114 | 105 | ||
115 | #define read_block() \ | 106 | #define read_block() \ |
116 | movq (RIO), RX0; \ | 107 | movq (RIO), RX0; \ |
@@ -130,16 +121,15 @@ | |||
130 | .type __blowfish_enc_blk,@function; | 121 | .type __blowfish_enc_blk,@function; |
131 | 122 | ||
132 | __blowfish_enc_blk: | 123 | __blowfish_enc_blk: |
133 | // input: | 124 | /* input: |
134 | // %rdi: ctx, CTX | 125 | * %rdi: ctx, CTX |
135 | // %rsi: dst | 126 | * %rsi: dst |
136 | // %rdx: src | 127 | * %rdx: src |
137 | // %rcx: bool xor | 128 | * %rcx: bool, if true: xor output |
138 | pushq %rbp; | 129 | */ |
139 | pushq %rbx; | 130 | movq %rbp, %r11; |
140 | 131 | ||
141 | pushq %rsi; | 132 | movq %rsi, %r10; |
142 | pushq %rcx; | ||
143 | movq %rdx, RIO; | 133 | movq %rdx, RIO; |
144 | 134 | ||
145 | read_block(); | 135 | read_block(); |
@@ -154,38 +144,31 @@ __blowfish_enc_blk: | |||
154 | round_enc(14); | 144 | round_enc(14); |
155 | add_roundkey_enc(16); | 145 | add_roundkey_enc(16); |
156 | 146 | ||
157 | popq %rbp; | 147 | movq %r11, %rbp; |
158 | popq RIO; | ||
159 | 148 | ||
160 | test %bpl, %bpl; | 149 | movq %r10, RIO; |
150 | test %cl, %cl; | ||
161 | jnz __enc_xor; | 151 | jnz __enc_xor; |
162 | 152 | ||
163 | write_block(); | 153 | write_block(); |
164 | |||
165 | __enc_ret: | ||
166 | popq %rbx; | ||
167 | popq %rbp; | ||
168 | |||
169 | ret; | 154 | ret; |
170 | |||
171 | __enc_xor: | 155 | __enc_xor: |
172 | xor_block(); | 156 | xor_block(); |
173 | 157 | ret; | |
174 | jmp __enc_ret; | ||
175 | 158 | ||
176 | .align 8 | 159 | .align 8 |
177 | .global blowfish_dec_blk | 160 | .global blowfish_dec_blk |
178 | .type blowfish_dec_blk,@function; | 161 | .type blowfish_dec_blk,@function; |
179 | 162 | ||
180 | blowfish_dec_blk: | 163 | blowfish_dec_blk: |
181 | // input: | 164 | /* input: |
182 | // %rdi: ctx, CTX | 165 | * %rdi: ctx, CTX |
183 | // %rsi: dst | 166 | * %rsi: dst |
184 | // %rdx: src | 167 | * %rdx: src |
185 | pushq %rbp; | 168 | */ |
186 | pushq %rbx; | 169 | movq %rbp, %r11; |
187 | 170 | ||
188 | pushq %rsi; | 171 | movq %rsi, %r10; |
189 | movq %rdx, RIO; | 172 | movq %rdx, RIO; |
190 | 173 | ||
191 | read_block(); | 174 | read_block(); |
@@ -200,17 +183,33 @@ blowfish_dec_blk: | |||
200 | round_dec(3); | 183 | round_dec(3); |
201 | add_roundkey_dec(1); | 184 | add_roundkey_dec(1); |
202 | 185 | ||
203 | popq RIO; | 186 | movq %r10, RIO; |
204 | write_block(); | 187 | write_block(); |
205 | 188 | ||
206 | popq %rbx; | 189 | movq %r11, %rbp; |
207 | popq %rbp; | ||
208 | 190 | ||
209 | ret; | 191 | ret; |
210 | 192 | ||
211 | /********************************************************************** | 193 | /********************************************************************** |
212 | 4-way blowfish, four blocks parallel | 194 | 4-way blowfish, four blocks parallel |
213 | **********************************************************************/ | 195 | **********************************************************************/ |
196 | |||
197 | /* F() for 4-way. Slower when used alone/1-way, but faster when used | ||
198 | * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330). | ||
199 | */ | ||
200 | #define F4(x) \ | ||
201 | movzbl x ## bh, RT1d; \ | ||
202 | movzbl x ## bl, RT3d; \ | ||
203 | rorq $16, x; \ | ||
204 | movzbl x ## bh, RT0d; \ | ||
205 | movzbl x ## bl, RT2d; \ | ||
206 | rorq $16, x; \ | ||
207 | movl s0(CTX,RT0,4), RT0d; \ | ||
208 | addl s1(CTX,RT2,4), RT0d; \ | ||
209 | xorl s2(CTX,RT1,4), RT0d; \ | ||
210 | addl s3(CTX,RT3,4), RT0d; \ | ||
211 | xorq RT0, x; | ||
212 | |||
214 | #define add_preloaded_roundkey4() \ | 213 | #define add_preloaded_roundkey4() \ |
215 | xorq RKEY, RX0; \ | 214 | xorq RKEY, RX0; \ |
216 | xorq RKEY, RX1; \ | 215 | xorq RKEY, RX1; \ |
@@ -227,15 +226,15 @@ blowfish_dec_blk: | |||
227 | #define round_enc4(n) \ | 226 | #define round_enc4(n) \ |
228 | add_roundkey_enc4(n); \ | 227 | add_roundkey_enc4(n); \ |
229 | \ | 228 | \ |
230 | F(RX0, RK0); \ | 229 | F4(RX0); \ |
231 | F(RX1, RK1); \ | 230 | F4(RX1); \ |
232 | F(RX2, RK2); \ | 231 | F4(RX2); \ |
233 | F(RX3, RK3); \ | 232 | F4(RX3); \ |
234 | \ | 233 | \ |
235 | F(RX0, RK0); \ | 234 | F4(RX0); \ |
236 | F(RX1, RK1); \ | 235 | F4(RX1); \ |
237 | F(RX2, RK2); \ | 236 | F4(RX2); \ |
238 | F(RX3, RK3); | 237 | F4(RX3); |
239 | 238 | ||
240 | #define preload_roundkey_dec(n) \ | 239 | #define preload_roundkey_dec(n) \ |
241 | movq p+4*((n)-1)(CTX), RKEY; \ | 240 | movq p+4*((n)-1)(CTX), RKEY; \ |
@@ -248,15 +247,15 @@ blowfish_dec_blk: | |||
248 | #define round_dec4(n) \ | 247 | #define round_dec4(n) \ |
249 | add_roundkey_dec4(n); \ | 248 | add_roundkey_dec4(n); \ |
250 | \ | 249 | \ |
251 | F(RX0, RK0); \ | 250 | F4(RX0); \ |
252 | F(RX1, RK1); \ | 251 | F4(RX1); \ |
253 | F(RX2, RK2); \ | 252 | F4(RX2); \ |
254 | F(RX3, RK3); \ | 253 | F4(RX3); \ |
255 | \ | 254 | \ |
256 | F(RX0, RK0); \ | 255 | F4(RX0); \ |
257 | F(RX1, RK1); \ | 256 | F4(RX1); \ |
258 | F(RX2, RK2); \ | 257 | F4(RX2); \ |
259 | F(RX3, RK3); | 258 | F4(RX3); |
260 | 259 | ||
261 | #define read_block4() \ | 260 | #define read_block4() \ |
262 | movq (RIO), RX0; \ | 261 | movq (RIO), RX0; \ |
@@ -306,18 +305,19 @@ blowfish_dec_blk: | |||
306 | .type __blowfish_enc_blk_4way,@function; | 305 | .type __blowfish_enc_blk_4way,@function; |
307 | 306 | ||
308 | __blowfish_enc_blk_4way: | 307 | __blowfish_enc_blk_4way: |
309 | // input: | 308 | /* input: |
310 | // %rdi: ctx, CTX | 309 | * %rdi: ctx, CTX |
311 | // %rsi: dst | 310 | * %rsi: dst |
312 | // %rdx: src | 311 | * %rdx: src |
313 | // %rcx: bool xor | 312 | * %rcx: bool, if true: xor output |
313 | */ | ||
314 | pushq %rbp; | 314 | pushq %rbp; |
315 | pushq %rbx; | 315 | pushq %rbx; |
316 | pushq RKEY; | 316 | pushq %rcx; |
317 | |||
317 | preload_roundkey_enc(0); | 318 | preload_roundkey_enc(0); |
318 | 319 | ||
319 | pushq %rsi; | 320 | movq %rsi, %r11; |
320 | pushq %rcx; | ||
321 | movq %rdx, RIO; | 321 | movq %rdx, RIO; |
322 | 322 | ||
323 | read_block4(); | 323 | read_block4(); |
@@ -333,40 +333,39 @@ __blowfish_enc_blk_4way: | |||
333 | add_preloaded_roundkey4(); | 333 | add_preloaded_roundkey4(); |
334 | 334 | ||
335 | popq %rbp; | 335 | popq %rbp; |
336 | popq RIO; | 336 | movq %r11, RIO; |
337 | 337 | ||
338 | test %bpl, %bpl; | 338 | test %bpl, %bpl; |
339 | jnz __enc_xor4; | 339 | jnz __enc_xor4; |
340 | 340 | ||
341 | write_block4(); | 341 | write_block4(); |
342 | 342 | ||
343 | __enc_ret4: | ||
344 | popq RKEY; | ||
345 | popq %rbx; | 343 | popq %rbx; |
346 | popq %rbp; | 344 | popq %rbp; |
347 | |||
348 | ret; | 345 | ret; |
349 | 346 | ||
350 | __enc_xor4: | 347 | __enc_xor4: |
351 | xor_block4(); | 348 | xor_block4(); |
352 | 349 | ||
353 | jmp __enc_ret4; | 350 | popq %rbx; |
351 | popq %rbp; | ||
352 | ret; | ||
354 | 353 | ||
355 | .align 8 | 354 | .align 8 |
356 | .global blowfish_dec_blk_4way | 355 | .global blowfish_dec_blk_4way |
357 | .type blowfish_dec_blk_4way,@function; | 356 | .type blowfish_dec_blk_4way,@function; |
358 | 357 | ||
359 | blowfish_dec_blk_4way: | 358 | blowfish_dec_blk_4way: |
360 | // input: | 359 | /* input: |
361 | // %rdi: ctx, CTX | 360 | * %rdi: ctx, CTX |
362 | // %rsi: dst | 361 | * %rsi: dst |
363 | // %rdx: src | 362 | * %rdx: src |
363 | */ | ||
364 | pushq %rbp; | 364 | pushq %rbp; |
365 | pushq %rbx; | 365 | pushq %rbx; |
366 | pushq RKEY; | ||
367 | preload_roundkey_dec(17); | 366 | preload_roundkey_dec(17); |
368 | 367 | ||
369 | pushq %rsi; | 368 | movq %rsi, %r11; |
370 | movq %rdx, RIO; | 369 | movq %rdx, RIO; |
371 | 370 | ||
372 | read_block4(); | 371 | read_block4(); |
@@ -381,10 +380,9 @@ blowfish_dec_blk_4way: | |||
381 | round_dec4(3); | 380 | round_dec4(3); |
382 | add_preloaded_roundkey4(); | 381 | add_preloaded_roundkey4(); |
383 | 382 | ||
384 | popq RIO; | 383 | movq %r11, RIO; |
385 | write_block4(); | 384 | write_block4(); |
386 | 385 | ||
387 | popq RKEY; | ||
388 | popq %rbx; | 386 | popq %rbx; |
389 | popq %rbp; | 387 | popq %rbp; |
390 | 388 | ||