author		Jussi Kivilinna <jussi.kivilinna@mbnet.fi>	2012-08-28 07:24:49 -0400
committer	Herbert Xu <herbert@gondor.apana.org.au>	2012-09-06 16:17:04 -0400
commit		ddaea7869d29beb9e0042c96ea52c9cca2afd68a (patch)
tree		d4d6d6e71ae0d1d451c58e7910ae3a05f55d0ad8
parent		f94a73f8dd5644f45f9d2e3139608ca83b932d93 (diff)
crypto: cast5-avx - tune assembler code for more performance
This patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies, interleaves instructions better for out-of-order scheduling,
and merges the constant 16-bit rotation with the round-key variable rotation.
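
As a rough illustration of the false-dependency point (a simplified sketch, not
lifted verbatim from the patch; the register choices here are arbitrary):

	/* old pattern: the index register is zeroed once up front and movb
	 * then writes only its low byte, so every later use of the full
	 * register also depends on whatever wrote it last
	 * (partial-register merge) */
	movb	%bl, %al
	movl	cast5_s1(, %rax, 4), %edx

	/* new pattern: movzbl writes the whole register (zero-extending),
	 * so the table load depends only on %bl and the separate zeroing
	 * instruction can be dropped */
	movzbl	%bl, %eax
	movl	cast5_s1(, %rax, 4), %edx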
tcrypt ECB results (128bit key):

Intel Core i5-2450M:

size	old-vs-new	new-vs-generic	old-vs-generic
	enc	dec	enc	dec	enc	dec
256	1.18x	1.18x	2.45x	2.47x	2.08x	2.10x
1k	1.20x	1.20x	2.73x	2.73x	2.28x	2.28x
8k	1.20x	1.19x	2.73x	2.73x	2.28x	2.29x
[v2]
- Do instruction interleaving another way to avoid adding new FPU<=>CPU
  register moves, as these cause a performance drop on Bulldozer.
- Improvements to round-key variable rotation handling.
- Further interleaving improvements for better out-of-order scheduling.
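
The round-key rotation handling rests on a small identity: the per-round rotate
counts stored in kr(CTX) are 5-bit values, and (k + 16) mod 32 only flips bit 4
of k (for example 20 + 16 = 36 = 4 mod 32, and 20 ^ 16 = 4), so the constant
16-bit rotation can be folded into the stored counts with a single XOR, done
once per call. Condensed from the patch's enc_preload_rkr() macro:

	vbroadcastss .L16_mask, RKR;	/* every byte of RKR = 16 */
	vpxor kr(CTX), RKR, RKR;	/* each count ^= 16, i.e. += 16 mod 32 */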
Cc: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 266 ++++++++++++++++++-----------
 1 file changed, 160 insertions(+), 106 deletions(-)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index 94693c877e3b..a41a3aaba220 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *	<Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -22,7 +24,6 @@
  */
 
 .file "cast5-avx-x86_64-asm_64.S"
-.text
 
 .extern cast5_s1
 .extern cast5_s2
@@ -57,17 +58,19 @@
 #define RX %xmm8
 
 #define RKM  %xmm9
-#define RKRF %xmm10
-#define RKRR %xmm11
+#define RKR  %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+
+#define R32  %xmm13
+#define R1ST %xmm14
 
-#define RTMP  %xmm12
-#define RMASK %xmm13
-#define R32   %xmm14
+#define RTMP %xmm15
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -76,6 +79,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RFS1  %r8
 #define RFS1d %r8d
 #define RFS2  %r9
@@ -84,60 +94,84 @@
 #define RFS3d %r10d
 
 
-#define lookup_32bit(src, dst, op1, op2, op3) \
-	movb src ## bl, RID1b; \
-	movb src ## bh, RID2b; \
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+	movzbl src ## bh, RID1d; \
+	movzbl src ## bl, RID2d; \
+	shrq $16, src; \
 	movl s1(, RID1, 4), dst ## d; \
 	op1 s2(, RID2, 4), dst ## d; \
-	shrq $16, src; \
-	movb src ## bl, RID1b; \
-	movb src ## bh, RID2b; \
+	movzbl src ## bh, RID1d; \
+	movzbl src ## bl, RID2d; \
+	interleave_op(il_reg); \
 	op2 s3(, RID1, 4), dst ## d; \
 	op3 s4(, RID2, 4), dst ## d;
 
-#define F(a, x, op0, op1, op2, op3) \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+	shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
 	op0 a, RKM, x; \
 	vpslld RKRF, x, RTMP; \
 	vpsrld RKRR, x, x; \
 	vpor RTMP, x, x; \
 	\
-	vpshufb RMASK, x, x; \
-	vmovq x, RGI1; \
-	vpsrldq $8, x, x; \
-	vmovq x, RGI2; \
-	\
-	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
-	shrq $16, RGI1; \
-	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
-	shlq $32, RFS2; \
-	orq RFS1, RFS2; \
+	vmovq x, gi1; \
+	vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
 	\
-	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
-	shrq $16, RGI2; \
-	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
-	shlq $32, RFS3; \
-	orq RFS1, RFS3; \
+	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+	shlq $32, RFS2; \
+	orq RFS1, RFS2; \
+	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+	shlq $32, RFS1; \
+	orq RFS1, RFS3; \
 	\
 	vmovq RFS2, x; \
 	vpinsrq $1, RFS3, x, x;
 
-#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
-#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
-#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+	F_head(b1, RX, RGI1, RGI2, op0); \
+	F_head(b2, RX, RGI3, RGI4, op0); \
+	\
+	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+	\
+	vpxor a1, RX, a1; \
+	vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
 
-#define subround(a, b, x, n, f) \
-	F ## f(b, x); \
-	vpxor a, x, a;
+#define subround(a1, b1, a2, b2, f) \
+	F ## f ## _2(a1, b1, a2, b2);
 
 #define round(l, r, n, f) \
 	vbroadcastss (km+(4*n))(CTX), RKM; \
-	vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \
+	vpand R1ST, RKR, RKRF; \
 	vpsubq RKRF, R32, RKRR; \
-	subround(l ## 1, r ## 1, RX, n, f); \
-	subround(l ## 2, r ## 2, RX, n, f); \
-	subround(l ## 3, r ## 3, RX, n, f); \
-	subround(l ## 4, r ## 4, RX, n, f);
+	vpsrldq $1, RKR, RKR; \
+	subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
+	subround(l ## 3, r ## 3, l ## 4, r ## 4, f);
+
+#define enc_preload_rkr() \
+	vbroadcastss .L16_mask, RKR; \
+	/* add 16-bit rotation to key rotations (mod 32) */ \
+	vpxor kr(CTX), RKR, RKR;
 
+#define dec_preload_rkr() \
+	vbroadcastss .L16_mask, RKR; \
+	/* add 16-bit rotation to key rotations (mod 32) */ \
+	vpxor kr(CTX), RKR, RKR; \
+	vpshufb .Lbswap128_mask, RKR, RKR;
 
 #define transpose_2x4(x0, x1, t0, t1) \
 	vpunpckldq x1, x0, t0; \
@@ -146,37 +180,47 @@
 	vpunpcklqdq t1, t0, x0; \
 	vpunpckhqdq t1, t0, x1;
 
-#define inpack_blocks(in, x0, x1, t0, t1) \
+#define inpack_blocks(in, x0, x1, t0, t1, rmask) \
 	vmovdqu (0*4*4)(in), x0; \
 	vmovdqu (1*4*4)(in), x1; \
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
 	\
 	transpose_2x4(x0, x1, t0, t1)
 
-#define outunpack_blocks(out, x0, x1, t0, t1) \
+#define outunpack_blocks(out, x0, x1, t0, t1, rmask) \
 	transpose_2x4(x0, x1, t0, t1) \
 	\
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
 	vmovdqu x0, (0*4*4)(out); \
 	vmovdqu x1, (1*4*4)(out);
 
-#define outunpack_xor_blocks(out, x0, x1, t0, t1) \
+#define outunpack_xor_blocks(out, x0, x1, t0, t1, rmask) \
 	transpose_2x4(x0, x1, t0, t1) \
 	\
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
 	vpxor (0*4*4)(out), x0, x0; \
 	vmovdqu x0, (0*4*4)(out); \
 	vpxor (1*4*4)(out), x1, x1; \
 	vmovdqu x1, (1*4*4)(out);
 
+.data
+
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lbswap128_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.L16_mask:
+	.byte 16, 16, 16, 16
 .L32_mask:
-	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0
+	.byte 32, 0, 0, 0
+.Lfirst_mask:
+	.byte 0x1f, 0, 0, 0
+
+.text
 
 .align 16
 .global __cast5_enc_blk_16way
@@ -190,23 +234,24 @@ __cast5_enc_blk_16way:
 	 *	%rcx: bool, if true: xor output
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;
 
-	vmovdqu .Lbswap_mask, RMASK;
-	vmovdqu .L32_mask, R32;
-	vpxor RKRF, RKRF, RKRF;
+	vmovdqa .Lbswap_mask, RKM;
+	vmovd .Lfirst_mask, R1ST;
+	vmovd .L32_mask, R32;
+	enc_preload_rkr();
 
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
-	leaq (2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+	leaq 1*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
 	round(RL, RR, 0, 1);
 	round(RR, RL, 1, 2);
@@ -221,8 +266,8 @@ __cast5_enc_blk_16way:
 	round(RL, RR, 10, 2);
 	round(RR, RL, 11, 3);
 
-	movb rr(CTX), %al;
-	testb %al, %al;
+	movzbl rr(CTX), %eax;
+	testl %eax, %eax;
 	jnz __skip_enc;
 
 	round(RL, RR, 12, 1);
@@ -233,28 +278,30 @@ __cast5_enc_blk_16way:
 __skip_enc:
 	popq %rcx;
 	popq %rbx;
+	popq %rbp;
+
+	vmovdqa .Lbswap_mask, RKM;
+	leaq 1*(2*4*4)(%r11), %rax;
 
 	testb %cl, %cl;
 	jnz __enc_xor16;
 
-	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
-	leaq (2*4*4)(%rsi), %rax;
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
 __enc_xor16:
-	outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX);
-	leaq (2*4*4)(%rsi), %rax;
-	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX);
+	outunpack_xor_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
+	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%r11), %rax;
+	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%r11), %rax;
+	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
 
 	ret;
 
@@ -269,25 +316,26 @@ cast5_dec_blk_16way:
 	 *	%rdx: src
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 
-	vmovdqu .Lbswap_mask, RMASK;
-	vmovdqu .L32_mask, R32;
-	vpxor RKRF, RKRF, RKRF;
+	vmovdqa .Lbswap_mask, RKM;
+	vmovd .Lfirst_mask, R1ST;
+	vmovd .L32_mask, R32;
+	dec_preload_rkr();
 
-	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
-	leaq (2*4*4)(%rdx), %rax;
-	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	inpack_blocks(%rax, RL4, RR4, RTMP, RX);
+	leaq 1*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rdx, RL1, RR1, RTMP, RX, RKM);
+	inpack_blocks(%rax, RL2, RR2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL3, RR3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%rdx), %rax;
+	inpack_blocks(%rax, RL4, RR4, RTMP, RX, RKM);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
-	movb rr(CTX), %al;
-	testb %al, %al;
+	movzbl rr(CTX), %eax;
+	testl %eax, %eax;
 	jnz __skip_dec;
 
 	round(RL, RR, 15, 1);
@@ -295,7 +343,7 @@ cast5_dec_blk_16way:
 	round(RL, RR, 13, 2);
 	round(RR, RL, 12, 1);
 
-__skip_dec:
+__dec_tail:
 	round(RL, RR, 11, 3);
 	round(RR, RL, 10, 2);
 	round(RL, RR, 9, 1);
@@ -309,14 +357,20 @@ __skip_dec:
 	round(RL, RR, 1, 2);
 	round(RR, RL, 0, 1);
 
+	vmovdqa .Lbswap_mask, RKM;
 	popq %rbx;
+	popq %rbp;
 
-	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
-	leaq (2*4*4)(%rsi), %rax;
-	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
-	leaq (2*4*4)(%rax), %rax;
-	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);
+	leaq 1*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%r11, RR1, RL1, RTMP, RX, RKM);
+	outunpack_blocks(%rax, RR2, RL2, RTMP, RX, RKM);
+	leaq 2*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR3, RL3, RTMP, RX, RKM);
+	leaq 3*(2*4*4)(%r11), %rax;
+	outunpack_blocks(%rax, RR4, RL4, RTMP, RX, RKM);
 
 	ret;
+
+__skip_dec:
+	vpsrldq $4, RKR, RKR;
+	jmp __dec_tail;