author    Jussi Kivilinna <jussi.kivilinna@mbnet.fi>   2012-08-28 07:24:54 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>     2012-09-06 16:17:05 -0400
commit    c09220e1bc97d83cae445cab8dcb057fabd62361
tree      2f7a80b98592630c28350ebe4cb0f3be616cc8f8
parent    ddaea7869d29beb9e0042c96ea52c9cca2afd68a
crypto: cast6-avx - tune assembler code for more performance
This patch replaces 'movb' instructions with 'movzbl' to break false register
dependencies, interleaves instructions better for out-of-order scheduling,
and merges the constant 16-bit rotation with the round-key variable rotation.
tcrypt ECB results:

Intel Core i5-2450M:

size    old-vs-new      new-vs-generic  old-vs-generic
        enc     dec     enc     dec     enc     dec
256     1.13x   1.19x   2.05x   2.17x   1.82x   1.82x
1k      1.18x   1.21x   2.26x   2.33x   1.93x   1.93x
8k      1.19x   1.19x   2.32x   2.33x   1.95x   1.95x
[v2]
 - Do the instruction interleaving another way to avoid adding new FPU<=>CPU
   register moves, as these cause a performance drop on Bulldozer (see the
   example below).
 - Improvements to round-key variable rotation handling.
 - Further interleaving improvements for better out-of-order scheduling.
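
For context, the FPU<=>CPU moves mentioned above are transfers between SIMD
(XMM) and general-purpose registers. A few hypothetical examples, not taken
from this patch:

	vmovq	%xmm8, %rdx		/* low 64 bits of an XMM register -> GPR */
	vpextrq	$1, %xmm8, %rcx		/* high 64 bits of an XMM register -> GPR */
	vmovq	%rdx, %xmm8		/* GPR -> XMM register */

Such cross-domain moves are comparatively expensive on AMD Bulldozer, which is
why v2 rearranges the interleaving without introducing more of them.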
Cc: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 276
 1 file changed, 162 insertions, 114 deletions
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index d258ce0d2e06..218d283772f4 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -22,7 +24,6 @@
  */
 
 .file "cast6-avx-x86_64-asm_64.S"
-.text
 
 .extern cast6_s1
 .extern cast6_s2
@@ -54,20 +55,21 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
 #define RX %xmm8
 
 #define RKM %xmm9
-#define RKRF %xmm10
-#define RKRR %xmm11
+#define RKR %xmm10
+#define RKRF %xmm11
+#define RKRR %xmm12
+#define R32 %xmm13
+#define R1ST %xmm14
 
-#define RTMP %xmm12
-#define RMASK %xmm13
-#define R32 %xmm14
+#define RTMP %xmm15
 
-#define RID1 %rax
-#define RID1b %al
-#define RID2 %rbx
-#define RID2b %bl
+#define RID1 %rbp
+#define RID1d %ebp
+#define RID2 %rsi
+#define RID2d %esi
 
 #define RGI1 %rdx
 #define RGI1bl %dl
@@ -76,6 +78,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3 %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4 %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RFS1 %r8
 #define RFS1d %r8d
 #define RFS2 %r9
@@ -84,95 +93,106 @@
 #define RFS3d %r10d
 
 
-#define lookup_32bit(src, dst, op1, op2, op3) \
-	movb src ## bl, RID1b; \
-	movb src ## bh, RID2b; \
+#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
+	movzbl src ## bh, RID1d; \
+	movzbl src ## bl, RID2d; \
+	shrq $16, src; \
 	movl s1(, RID1, 4), dst ## d; \
 	op1 s2(, RID2, 4), dst ## d; \
-	shrq $16, src; \
-	movb src ## bl, RID1b; \
-	movb src ## bh, RID2b; \
+	movzbl src ## bh, RID1d; \
+	movzbl src ## bl, RID2d; \
+	interleave_op(il_reg); \
 	op2 s3(, RID1, 4), dst ## d; \
 	op3 s4(, RID2, 4), dst ## d;
 
-#define F(a, x, op0, op1, op2, op3) \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+	shrq $16, reg;
+
+#define F_head(a, x, gi1, gi2, op0) \
 	op0 a, RKM, x; \
 	vpslld RKRF, x, RTMP; \
 	vpsrld RKRR, x, x; \
 	vpor RTMP, x, x; \
 	\
-	vpshufb RMASK, x, x; \
-	vmovq x, RGI1; \
-	vpsrldq $8, x, x; \
-	vmovq x, RGI2; \
-	\
-	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
-	shrq $16, RGI1; \
-	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
-	shlq $32, RFS2; \
-	orq RFS1, RFS2; \
+	vmovq x, gi1; \
+	vpextrq $1, x, gi2;
+
+#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
+	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
+	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
 	\
-	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
-	shrq $16, RGI2; \
-	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
-	shlq $32, RFS3; \
-	orq RFS1, RFS3; \
+	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
+	shlq $32, RFS2; \
+	orq RFS1, RFS2; \
+	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
+	shlq $32, RFS1; \
+	orq RFS1, RFS3; \
 	\
 	vmovq RFS2, x; \
 	vpinsrq $1, RFS3, x, x;
 
-#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
-#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
-#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
+#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
+	F_head(b1, RX, RGI1, RGI2, op0); \
+	F_head(b2, RX, RGI3, RGI4, op0); \
+	\
+	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
+	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
+	\
+	vpxor a1, RX, a1; \
+	vpxor a2, RTMP, a2;
+
+#define F1_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
+#define F2_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
+#define F3_2(a1, b1, a2, b2) \
+	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
 
-#define qop(in, out, x, f) \
-	F ## f(in ## 1, x); \
-	vpxor out ## 1, x, out ## 1; \
-	F ## f(in ## 2, x); \
-	vpxor out ## 2, x, out ## 2; \
+#define qop(in, out, f) \
+	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
+
+#define get_round_keys(nn) \
+	vbroadcastss (km+(4*(nn)))(CTX), RKM; \
+	vpand R1ST, RKR, RKRF; \
+	vpsubq RKRF, R32, RKRR; \
+	vpsrldq $1, RKR, RKR;
 
 #define Q(n) \
-	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RD, RC, RX, 1); \
+	get_round_keys(4*n+0); \
+	qop(RD, RC, 1); \
 	\
-	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RC, RB, RX, 2); \
+	get_round_keys(4*n+1); \
+	qop(RC, RB, 2); \
 	\
-	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RB, RA, RX, 3); \
+	get_round_keys(4*n+2); \
+	qop(RB, RA, 3); \
 	\
-	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RA, RD, RX, 1);
+	get_round_keys(4*n+3); \
+	qop(RA, RD, 1);
 
 #define QBAR(n) \
-	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RA, RD, RX, 1); \
+	get_round_keys(4*n+3); \
+	qop(RA, RD, 1); \
 	\
-	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RB, RA, RX, 3); \
+	get_round_keys(4*n+2); \
+	qop(RB, RA, 3); \
 	\
-	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RC, RB, RX, 2); \
+	get_round_keys(4*n+1); \
+	qop(RC, RB, 2); \
 	\
-	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
-	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
-	vpsubq RKRF, R32, RKRR; \
-	qop(RD, RC, RX, 1);
+	get_round_keys(4*n+0); \
+	qop(RD, RC, 1);
+
+#define shuffle(mask) \
+	vpshufb mask, RKR, RKR;
 
+#define preload_rkr(n, do_mask, mask) \
+	vbroadcastss .L16_mask, RKR; \
+	/* add 16-bit rotation to key rotations (mod 32) */ \
+	vpxor (kr+n*16)(CTX), RKR, RKR; \
+	do_mask(mask);
 
 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	vpunpckldq x1, x0, t0; \
@@ -185,37 +205,37 @@
 	vpunpcklqdq x3, t2, x2; \
 	vpunpckhqdq x3, t2, x3;
 
-#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
+#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
 	vmovdqu (0*4*4)(in), x0; \
 	vmovdqu (1*4*4)(in), x1; \
 	vmovdqu (2*4*4)(in), x2; \
 	vmovdqu (3*4*4)(in), x3; \
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
-	vpshufb RMASK, x2, x2; \
-	vpshufb RMASK, x3, x3; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
+	vpshufb rmask, x2, x2; \
+	vpshufb rmask, x3, x3; \
 	\
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 
-#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
-	vpshufb RMASK, x2, x2; \
-	vpshufb RMASK, x3, x3; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
+	vpshufb rmask, x2, x2; \
+	vpshufb rmask, x3, x3; \
 	vmovdqu x0, (0*4*4)(out); \
 	vmovdqu x1, (1*4*4)(out); \
 	vmovdqu x2, (2*4*4)(out); \
 	vmovdqu x3, (3*4*4)(out);
 
-#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
+#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
 	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	\
-	vpshufb RMASK, x0, x0; \
-	vpshufb RMASK, x1, x1; \
-	vpshufb RMASK, x2, x2; \
-	vpshufb RMASK, x3, x3; \
+	vpshufb rmask, x0, x0; \
+	vpshufb rmask, x1, x1; \
+	vpshufb rmask, x2, x2; \
+	vpshufb rmask, x3, x3; \
 	vpxor (0*4*4)(out), x0, x0; \
 	vmovdqu x0, (0*4*4)(out); \
 	vpxor (1*4*4)(out), x1, x1; \
@@ -225,11 +245,29 @@
 	vpxor (3*4*4)(out), x3, x3; \
 	vmovdqu x3, (3*4*4)(out);
 
+.data
+
 .align 16
 .Lbswap_mask:
 	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_enc_Q_Q_QBAR_QBAR:
+	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
+	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
+.Lrkr_dec_Q_Q_Q_Q:
+	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
+.Lrkr_dec_Q_Q_QBAR_QBAR:
+	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
+.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+.L16_mask:
+	.byte 16, 16, 16, 16
 .L32_mask:
-	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0
+	.byte 32, 0, 0, 0
+.Lfirst_mask:
+	.byte 0x1f, 0, 0, 0
+
+.text
 
 .align 16
 .global __cast6_enc_blk_8way
@@ -243,28 +281,31 @@ __cast6_enc_blk_8way:
 	 * %rcx: bool, if true: xor output
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;
 
-	vmovdqu .Lbswap_mask, RMASK;
-	vmovdqu .L32_mask, R32;
-	vpxor RKRF, RKRF, RKRF;
+	vmovdqa .Lbswap_mask, RKM;
+	vmovd .Lfirst_mask, R1ST;
+	vmovd .L32_mask, R32;
 
 	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
+	preload_rkr(0, dummy, none);
 	Q(0);
 	Q(1);
 	Q(2);
 	Q(3);
+	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
 	Q(4);
 	Q(5);
 	QBAR(6);
 	QBAR(7);
+	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
 	QBAR(8);
 	QBAR(9);
 	QBAR(10);
@@ -272,20 +313,22 @@ __cast6_enc_blk_8way:
 
 	popq %rcx;
 	popq %rbx;
+	popq %rbp;
 
-	leaq (4*4*4)(%rsi), %rax;
+	vmovdqa .Lbswap_mask, RKM;
+	leaq (4*4*4)(%r11), %rax;
 
 	testb %cl, %cl;
 	jnz __enc_xor8;
 
-	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
 __enc_xor8:
-	outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
-	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+	outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;
 
@@ -300,36 +343,41 @@ cast6_dec_blk_8way:
 	 * %rdx: src
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 
-	vmovdqu .Lbswap_mask, RMASK;
-	vmovdqu .L32_mask, R32;
-	vpxor RKRF, RKRF, RKRF;
+	vmovdqa .Lbswap_mask, RKM;
+	vmovd .Lfirst_mask, R1ST;
+	vmovd .L32_mask, R32;
 
 	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
+	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
 	Q(11);
 	Q(10);
 	Q(9);
 	Q(8);
+	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
 	Q(7);
 	Q(6);
 	QBAR(5);
 	QBAR(4);
+	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
 	QBAR(3);
 	QBAR(2);
 	QBAR(1);
 	QBAR(0);
 
 	popq %rbx;
+	popq %rbp;
 
-	leaq (4*4*4)(%rsi), %rax;
-	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);
+	vmovdqa .Lbswap_mask, RKM;
+	leaq (4*4*4)(%r11), %rax;
+	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
+	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
 
 	ret;