author     Jussi Kivilinna <jussi.kivilinna@mbnet.fi>  2012-08-28 07:24:49 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>    2012-09-06 16:17:04 -0400
commit     ddaea7869d29beb9e0042c96ea52c9cca2afd68a (patch)
tree       d4d6d6e71ae0d1d451c58e7910ae3a05f55d0ad8
parent     f94a73f8dd5644f45f9d2e3139608ca83b932d93 (diff)
crypto: cast5-avx - tune assembler code for more performance
Patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies, interleaves instructions better for out-of-order scheduling
and merges constant 16-bit rotation with round-key variable rotation.

tcrypt ECB results (128bit key):

Intel Core i5-2450M:

size    old-vs-new      new-vs-generic  old-vs-generic
        enc     dec     enc     dec     enc     dec
256     1.18x   1.18x   2.45x   2.47x   2.08x   2.10x
1k      1.20x   1.20x   2.73x   2.73x   2.28x   2.28x
8k      1.20x   1.19x   2.73x   2.73x   2.28x   2.29x

[v2]
 - Do instruction interleaving another way to avoid adding new FPU<=>CPU
   register moves as these cause performance drop on Bulldozer.
 - Improvements to round-key variable rotation handling.
 - Further interleaving improvements for better out-of-order scheduling.

Cc: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--  arch/x86/crypto/cast5-avx-x86_64-asm_64.S  266
1 file changed, 160 insertions(+), 106 deletions(-)
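The movb-to-movzbl change called out above targets partial-register writes: movb updates only the low byte of a register, so a later indexed load through the full register inherits a false dependency on whatever last wrote its upper bits, which is also why the old code had to pre-clear the index registers with xorq. Below is a minimal standalone sketch of the two patterns (illustration only, not taken from the patch; the symbols lookup_byte_old, lookup_byte_new and table are made up):

/* example.S -- hypothetical S-box lookup helpers; assemble with "gcc -c example.S" */
.text
.globl lookup_byte_old
lookup_byte_old:
	/* movb writes only %al, so the indexed load below also depends on
	 * whichever older instruction last wrote the upper bits of %rax;
	 * the register therefore has to be cleared up front (as the old
	 * code did with "xorq RID1, RID1; xorq RID2, RID2"). */
	xorq %rax, %rax
	movb %dil, %al
	movl table(, %rax, 4), %eax
	ret

.globl lookup_byte_new
lookup_byte_new:
	/* movzbl zero-extends the byte into the full register, breaking
	 * the false dependency and making the pre-clear unnecessary. */
	movzbl %dil, %eax
	movl table(, %rax, 4), %eax
	ret

.data
.align 4
table:
	.fill 256, 4, 0

The third listed optimisation, merging the constant 16-bit rotation into the per-round key rotation, works because for rotation amounts reduced mod 32 an XOR with 16 is equivalent to adding 16, so the new enc_preload_rkr()/dec_preload_rkr() macros can fold the constant rotation into RKR once per call with a single vpxor.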
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 94693c877e3b..a41a3aaba220 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -22,7 +24,6 @@
  */

 .file "cast5-avx-x86_64-asm_64.S"
-.text

 .extern cast5_s1
 .extern cast5_s2
@@ -57,17 +58,19 @@
 #define RX %xmm8

 #define RKM %xmm9
-#define RKRF %xmm10
-#define RKRR %xmm11
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+
+#define R32 %xmm13
+#define R1ST %xmm14

-#define RTMP %xmm12
-#define RMASK %xmm13
-#define R32 %xmm14
+#define RTMP %xmm15

-#define RID1 %rax
-#define RID1b %al
-#define RID2 %rbx
-#define RID2b %bl
+#define RID1 %rbp
+#define RID1d %ebp
+#define RID2 %rsi
+#define RID2d %esi

 #define RGI1 %rdx
 #define RGI1bl %dl
@@ -76,6 +79,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch

+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RFS1 %r8
 #define RFS1d %r8d
 #define RFS2 %r9
@@ -84,60 +94,84 @@
 #define RFS3d %r10d


-#define lookup_32bit(src, dst, op1, op2, op3) \
-	movb src ## bl, RID1b; \
-	movb src ## bh, RID2b; \
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+	movzbl src ## bh, RID1d; \
+	movzbl src ## bl, RID2d; \
+	shrq $16, src; \
 	movl s1(, RID1, 4), dst ## d; \
 	op1 s2(, RID2, 4), dst ## d; \
-	shrq $16, src; \
-	movb src ## bl, RID1b; \
-	movb src ## bh, RID2b; \
+	movzbl src ## bh, RID1d; \
+	movzbl src ## bl, RID2d; \
+	interleave_op(il_reg); \
 	op2 s3(, RID1, 4), dst ## d; \
 	op3 s4(, RID2, 4), dst ## d;

-#define F(a, x, op0, op1, op2, op3) \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+	shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
 	op0 a, RKM, x; \
 	vpslld RKRF, x, RTMP; \
 	vpsrld RKRR, x, x; \
 	vpor RTMP, x, x; \
 	\
-	vpshufb RMASK, x, x; \
-	vmovq x, RGI1; \
-	vpsrldq $8, x, x; \
-	vmovq x, RGI2; \
-	\
-	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
-	shrq $16, RGI1; \
-	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
-	shlq $32, RFS2; \
-	orq RFS1, RFS2; \
+	vmovq x, gi1; \
+	vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
 	\
-	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
-	shrq $16, RGI2; \
-	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
-	shlq $32, RFS3; \
-	orq RFS1, RFS3; \
+	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+	shlq $32, RFS2; \
+	orq RFS1, RFS2; \
+	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+	shlq $32, RFS1; \
+	orq RFS1, RFS3; \
 	\
 	vmovq RFS2, x; \
 	vpinsrq $1, RFS3, x, x;

-#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
-#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
-#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+	F_head(b1, RX, RGI1, RGI2, op0); \
+	F_head(b2, RX, RGI3, RGI4, op0); \
+	\
+	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+	\
+	vpxor a1, RX, a1; \
+	vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

-#define subround(a, b, x, n, f) \
-	F ## f(b, x); \
-	vpxor a, x, a;
+#define subround(a1, b1, a2, b2, f) \
+	F ## f ## _2(a1, b1, a2, b2);

 #define round(l, r, n, f) \
 	vbroadcastss (km+(4*n))(CTX), RKM; \
-	vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \
+	vpand R1ST, RKR, RKRF; \
 	vpsubq RKRF, R32, RKRR; \
-	subround(l ## 1, r ## 1, RX, n, f); \
-	subround(l ## 2, r ## 2, RX, n, f); \
-	subround(l ## 3, r ## 3, RX, n, f); \
-	subround(l ## 4, r ## 4, RX, n, f);
+	vpsrldq $1, RKR, RKR; \
+	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
+	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
+
+#define enc_preload_rkr() \
+	vbroadcastss .L16_mask, RKR; \
+	/* add 16-bit rotation to key rotations (mod 32) */ \
+	vpxor kr(CTX), RKR, RKR;

+#define dec_preload_rkr() \
+	vbroadcastss .L16_mask, RKR; \
+	/* add 16-bit rotation to key rotations (mod 32) */ \
+	vpxor kr(CTX), RKR, RKR; \
+	vpshufb .Lbswap128_mask, RKR, RKR;

 #define transpose_2x4(x0, x1, t0, t1) \
 	vpunpckldq x1, x0, t0; \
@@ -146,37 +180,47 @@
 	vpunpcklqdq t1, t0, x0; \
 	vpunpckhqdq t1, t0, x1;

-#define inpack_blocks(in, x0, x1, t0, t1) \
+#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
 	vmovdqu (0*4*4)(in), x0; \
 	vmovdqu (1*4*4)(in), x1; \
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
 	\
 	transpose_2x4(x0, x1, t0, t1)

-#define outunpack_blocks(out, x0, x1, t0, t1) \
+#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
 	transpose_2x4(x0, x1, t0, t1) \
 	\
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
 	vmovdqu x0, (0*4*4)(out); \
 	vmovdqu x1, (1*4*4)(out);

-#define outunpack_xor_blocks(out, x0, x1, t0, t1) \
+#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
 	transpose_2x4(x0, x1, t0, t1) \
 	\
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
 	vpxor (0*4*4)(out), x0, x0; \
 	vmovdqu x0, (0*4*4)(out); \
 	vpxor (1*4*4)(out), x1, x1; \
 	vmovdqu x1, (1*4*4)(out);

+.data
+
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.L16_mask:
+	.byte 16, 16, 16, 16
 .L32_mask:
-	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0
+	.byte 32, 0, 0, 0
+.Lfirst_mask:
+	.byte 0x1f, 0, 0, 0
+
+.text

 .align 16
 .global __cast5_enc_blk_16way
@@ -190,23 +234,24 @@ __cast5_enc_blk_16way:
 	 * %rcx: bool, if true: xor output
 	 */

+	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;

-	vmovdqu .Lbswap_mask, RMASK;
-	vmovdqu .L32_mask, R32;
-	vpxor RKRF, RKRF, RKRF;
+	vmovdqa .Lbswap_mask, RKM;
+	vmovd .Lfirst_mask, R1ST;
+	vmovd .L32_mask, R32;
+	enc_preload_rkr();

-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
-	leaq (2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+	leaq 1*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);

-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;

 	round(RL, RR, 0, 1);
 	round(RR, RL, 1, 2);
@@ -221,8 +266,8 @@ __cast5_enc_blk_16way:
 	round(RL, RR, 10, 2);
 	round(RR, RL, 11, 3);

-	movb rr(CTX), %al;
-	testb %al, %al;
+	movzbl rr(CTX), %eax;
+	testl %eax, %eax;
 	jnz __skip_enc;

 	round(RL, RR, 12, 1);
@@ -233,28 +278,30 @@ __cast5_enc_blk_16way:
 __skip_enc:
 	popq %rcx;
 	popq %rbx;
+	popq %rbp;
+
+	vmovdqa .Lbswap_mask, RKM;
+	leaq 1*(2*4*4)(%r11), %rax;

 	testb %cl, %cl;
 	jnz __enc_xor16;

-	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
-	leaq (2*4*4)(%rsi), %rax;
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);

 	ret;

 __enc_xor16:
-	outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX);
-	leaq (2*4*4)(%rsi), %rax;
-	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX);
+	outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
+	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%r11), %rax;
+	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%r11), %rax;
+	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);

 	ret;

@@ -269,25 +316,26 @@ cast5_dec_blk_16way:
 	 * %rdx: src
 	 */

+	pushq %rbp;
 	pushq %rbx;

-	vmovdqu .Lbswap_mask, RMASK;
-	vmovdqu .L32_mask, R32;
-	vpxor RKRF, RKRF, RKRF;
+	vmovdqa .Lbswap_mask, RKM;
+	vmovd .Lfirst_mask, R1ST;
+	vmovd .L32_mask, R32;
+	dec_preload_rkr();

-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
-	leaq (2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+	leaq 1*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);

-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;

-	movb rr(CTX), %al;
-	testb %al, %al;
+	movzbl rr(CTX), %eax;
+	testl %eax, %eax;
 	jnz __skip_dec;

 	round(RL, RR, 15, 1);
@@ -295,7 +343,7 @@ cast5_dec_blk_16way:
 	round(RL, RR, 13, 2);
 	round(RR, RL, 12, 1);

-__skip_dec:
+__dec_tail:
 	round(RL, RR, 11, 3);
 	round(RR, RL, 10, 2);
 	round(RL, RR, 9, 1);
@@ -309,14 +357,20 @@ __skip_dec:
 	round(RL, RR, 1, 2);
 	round(RR, RL, 0, 1);

+	vmovdqa .Lbswap_mask, RKM;
 	popq %rbx;
+	popq %rbp;

-	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
-	leaq (2*4*4)(%rsi), %rax;
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+	leaq 1*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);

 	ret;
+
+__skip_dec:
+	vpsrldq $4, RKR, RKR;
+	jmp __dec_tail;