author     Jussi Kivilinna <jussi.kivilinna@mbnet.fi>   2012-08-28 07:24:54 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>     2012-09-06 16:17:05 -0400
commit     c09220e1bc97d83cae445cab8dcb057fabd62361 (patch)
tree       2f7a80b98592630c28350ebe4cb0f3be616cc8f8
parent     ddaea7869d29beb9e0042c96ea52c9cca2afd68a (diff)
crypto: cast6-avx - tune assembler code for more performance
Patch replaces 'movb' instructions with 'movzbl' to break false register dependencies, interleaves instructions better for out-of-order scheduling and merges constant 16-bit rotation with round-key variable rotation.

tcrypt ECB results:

Intel Core i5-2450M:

size    old-vs-new      new-vs-generic  old-vs-generic
        enc     dec     enc     dec     enc     dec
256     1.13x   1.19x   2.05x   2.17x   1.82x   1.82x
1k      1.18x   1.21x   2.26x   2.33x   1.93x   1.93x
8k      1.19x   1.19x   2.32x   2.33x   1.95x   1.95x

[v2]
 - Do instruction interleaving another way to avoid adding new FPU<=>CPU
   register moves as these cause performance drop on Bulldozer.
 - Improvements to round-key variable rotation handling.
 - Further interleaving improvements for better out-of-order scheduling.

Cc: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r--  arch/x86/crypto/cast6-avx-x86_64-asm_64.S   276
1 file changed, 162 insertions, 114 deletions
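For readers unfamiliar with the partial-register issue behind the first change, the fragment below is a minimal sketch; it is not taken from the patch, and the table label and concrete registers are invented for illustration. Writing a byte register such as %al only updates bits 0-7, so the result has to be merged with the previous contents of %rax, which creates a false dependency on whatever instruction last wrote that register. movzbl instead writes the full 32-bit register with zero extension, so a dependent table load only has to wait for the source byte.

	/* old pattern: partial-register write (hypothetical example) */
	movb	%dl, %al                  /* merges into the old %rax -> false dependency */
	movl	table(, %rax, 4), %ecx    /* this load is held up by that merge */

	/* pattern used by the patch: full-register write */
	movzbl	%dl, %eax                 /* zero-extends, no dependency on the old %rax */
	movl	table(, %rax, 4), %ecx    /* this load depends only on %dl */

The second change, better interleaving, is visible in the new F_2/F_head/F_tail split below: the vector "head" of two independent F computations is issued before either integer lookup "tail", so the out-of-order core always has two independent dependency chains to work on.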
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index d258ce0d2e06..218d283772f4 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
4 * Copyright (C) 2012 Johannes Goetzfried 4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> 5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 * 6 *
7 * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
8 *
7 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or 11 * the Free Software Foundation; either version 2 of the License, or
@@ -22,7 +24,6 @@
22 */ 24 */
23 25
24.file "cast6-avx-x86_64-asm_64.S" 26.file "cast6-avx-x86_64-asm_64.S"
25.text
26 27
27.extern cast6_s1 28.extern cast6_s1
28.extern cast6_s2 29.extern cast6_s2
@@ -54,20 +55,21 @@
54#define RC2 %xmm6 55#define RC2 %xmm6
55#define RD2 %xmm7 56#define RD2 %xmm7
56 57
57#define RX %xmm8 58#define RX %xmm8
58 59
59#define RKM %xmm9 60#define RKM %xmm9
60#define RKRF %xmm10 61#define RKR %xmm10
61#define RKRR %xmm11 62#define RKRF %xmm11
63#define RKRR %xmm12
64#define R32 %xmm13
65#define R1ST %xmm14
62 66
63#define RTMP %xmm12 67#define RTMP %xmm15
64#define RMASK %xmm13
65#define R32 %xmm14
66 68
67#define RID1 %rax 69#define RID1 %rbp
68#define RID1b %al 70#define RID1d %ebp
69#define RID2 %rbx 71#define RID2 %rsi
70#define RID2b %bl 72#define RID2d %esi
71 73
72#define RGI1 %rdx 74#define RGI1 %rdx
73#define RGI1bl %dl 75#define RGI1bl %dl
@@ -76,6 +78,13 @@
76#define RGI2bl %cl 78#define RGI2bl %cl
77#define RGI2bh %ch 79#define RGI2bh %ch
78 80
81#define RGI3 %rax
82#define RGI3bl %al
83#define RGI3bh %ah
84#define RGI4 %rbx
85#define RGI4bl %bl
86#define RGI4bh %bh
87
79#define RFS1 %r8 88#define RFS1 %r8
80#define RFS1d %r8d 89#define RFS1d %r8d
81#define RFS2 %r9 90#define RFS2 %r9
@@ -84,95 +93,106 @@
84#define RFS3d %r10d 93#define RFS3d %r10d
85 94
86 95
87#define lookup_32bit(src, dst, op1, op2, op3) \ 96#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
88 movb src ## bl, RID1b; \ 97 movzbl src ## bh, RID1d; \
89 movb src ## bh, RID2b; \ 98 movzbl src ## bl, RID2d; \
99 shrq $16, src; \
90 movl s1(, RID1, 4), dst ## d; \ 100 movl s1(, RID1, 4), dst ## d; \
91 op1 s2(, RID2, 4), dst ## d; \ 101 op1 s2(, RID2, 4), dst ## d; \
92 shrq $16, src; \ 102 movzbl src ## bh, RID1d; \
93 movb src ## bl, RID1b; \ 103 movzbl src ## bl, RID2d; \
94 movb src ## bh, RID2b; \ 104 interleave_op(il_reg); \
95 op2 s3(, RID1, 4), dst ## d; \ 105 op2 s3(, RID1, 4), dst ## d; \
96 op3 s4(, RID2, 4), dst ## d; 106 op3 s4(, RID2, 4), dst ## d;
97 107
98#define F(a, x, op0, op1, op2, op3) \ 108#define dummy(d) /* do nothing */
109
110#define shr_next(reg) \
111 shrq $16, reg;
112
113#define F_head(a, x, gi1, gi2, op0) \
99 op0 a, RKM, x; \ 114 op0 a, RKM, x; \
100 vpslld RKRF, x, RTMP; \ 115 vpslld RKRF, x, RTMP; \
101 vpsrld RKRR, x, x; \ 116 vpsrld RKRR, x, x; \
102 vpor RTMP, x, x; \ 117 vpor RTMP, x, x; \
103 \ 118 \
104 vpshufb RMASK, x, x; \ 119 vmovq x, gi1; \
105 vmovq x, RGI1; \ 120 vpextrq $1, x, gi2;
106 vpsrldq $8, x, x; \ 121
107 vmovq x, RGI2; \ 122#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
108 \ 123 lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
109 lookup_32bit(RGI1, RFS1, op1, op2, op3); \ 124 lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
110 shrq $16, RGI1; \
111 lookup_32bit(RGI1, RFS2, op1, op2, op3); \
112 shlq $32, RFS2; \
113 orq RFS1, RFS2; \
114 \ 125 \
115 lookup_32bit(RGI2, RFS1, op1, op2, op3); \ 126 lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
116 shrq $16, RGI2; \ 127 shlq $32, RFS2; \
117 lookup_32bit(RGI2, RFS3, op1, op2, op3); \ 128 orq RFS1, RFS2; \
118 shlq $32, RFS3; \ 129 lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
119 orq RFS1, RFS3; \ 130 shlq $32, RFS1; \
131 orq RFS1, RFS3; \
120 \ 132 \
121 vmovq RFS2, x; \ 133 vmovq RFS2, x; \
122 vpinsrq $1, RFS3, x, x; 134 vpinsrq $1, RFS3, x, x;
123 135
124#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl) 136#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
125#define F2(b, x) F(b, x, vpxor, subl, addl, xorl) 137 F_head(b1, RX, RGI1, RGI2, op0); \
126#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl) 138 F_head(b2, RX, RGI3, RGI4, op0); \
139 \
140 F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
141 F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
142 \
143 vpxor a1, RX, a1; \
144 vpxor a2, RTMP, a2;
145
146#define F1_2(a1, b1, a2, b2) \
147 F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
148#define F2_2(a1, b1, a2, b2) \
149 F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
150#define F3_2(a1, b1, a2, b2) \
151 F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)
127 152
128#define qop(in, out, x, f) \ 153#define qop(in, out, f) \
129 F ## f(in ## 1, x); \ 154 F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);
130 vpxor out ## 1, x, out ## 1; \ 155
131 F ## f(in ## 2, x); \ 156#define get_round_keys(nn) \
132 vpxor out ## 2, x, out ## 2; \ 157 vbroadcastss (km+(4*(nn)))(CTX), RKM; \
158 vpand R1ST, RKR, RKRF; \
159 vpsubq RKRF, R32, RKRR; \
160 vpsrldq $1, RKR, RKR;
133 161
134#define Q(n) \ 162#define Q(n) \
135 vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ 163 get_round_keys(4*n+0); \
136 vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ 164 qop(RD, RC, 1); \
137 vpsubq RKRF, R32, RKRR; \
138 qop(RD, RC, RX, 1); \
139 \ 165 \
140 vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ 166 get_round_keys(4*n+1); \
141 vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ 167 qop(RC, RB, 2); \
142 vpsubq RKRF, R32, RKRR; \
143 qop(RC, RB, RX, 2); \
144 \ 168 \
145 vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ 169 get_round_keys(4*n+2); \
146 vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ 170 qop(RB, RA, 3); \
147 vpsubq RKRF, R32, RKRR; \
148 qop(RB, RA, RX, 3); \
149 \ 171 \
150 vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ 172 get_round_keys(4*n+3); \
151 vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ 173 qop(RA, RD, 1);
152 vpsubq RKRF, R32, RKRR; \
153 qop(RA, RD, RX, 1);
154 174
155#define QBAR(n) \ 175#define QBAR(n) \
156 vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \ 176 get_round_keys(4*n+3); \
157 vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \ 177 qop(RA, RD, 1); \
158 vpsubq RKRF, R32, RKRR; \
159 qop(RA, RD, RX, 1); \
160 \ 178 \
161 vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \ 179 get_round_keys(4*n+2); \
162 vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \ 180 qop(RB, RA, 3); \
163 vpsubq RKRF, R32, RKRR; \
164 qop(RB, RA, RX, 3); \
165 \ 181 \
166 vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \ 182 get_round_keys(4*n+1); \
167 vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \ 183 qop(RC, RB, 2); \
168 vpsubq RKRF, R32, RKRR; \
169 qop(RC, RB, RX, 2); \
170 \ 184 \
171 vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \ 185 get_round_keys(4*n+0); \
172 vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \ 186 qop(RD, RC, 1);
173 vpsubq RKRF, R32, RKRR; \ 187
174 qop(RD, RC, RX, 1); 188#define shuffle(mask) \
189 vpshufb mask, RKR, RKR;
175 190
191#define preload_rkr(n, do_mask, mask) \
192 vbroadcastss .L16_mask, RKR; \
193 /* add 16-bit rotation to key rotations (mod 32) */ \
194 vpxor (kr+n*16)(CTX), RKR, RKR; \
195 do_mask(mask);
176 196
177#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 197#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
178 vpunpckldq x1, x0, t0; \ 198 vpunpckldq x1, x0, t0; \
@@ -185,37 +205,37 @@
185 vpunpcklqdq x3, t2, x2; \ 205 vpunpcklqdq x3, t2, x2; \
186 vpunpckhqdq x3, t2, x3; 206 vpunpckhqdq x3, t2, x3;
187 207
188#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \ 208#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2, rmask) \
189 vmovdqu (0*4*4)(in), x0; \ 209 vmovdqu (0*4*4)(in), x0; \
190 vmovdqu (1*4*4)(in), x1; \ 210 vmovdqu (1*4*4)(in), x1; \
191 vmovdqu (2*4*4)(in), x2; \ 211 vmovdqu (2*4*4)(in), x2; \
192 vmovdqu (3*4*4)(in), x3; \ 212 vmovdqu (3*4*4)(in), x3; \
193 vpshufb RMASK, x0, x0; \ 213 vpshufb rmask, x0, x0; \
194 vpshufb RMASK, x1, x1; \ 214 vpshufb rmask, x1, x1; \
195 vpshufb RMASK, x2, x2; \ 215 vpshufb rmask, x2, x2; \
196 vpshufb RMASK, x3, x3; \ 216 vpshufb rmask, x3, x3; \
197 \ 217 \
198 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) 218 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
199 219
200#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ 220#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
201 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 221 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
202 \ 222 \
203 vpshufb RMASK, x0, x0; \ 223 vpshufb rmask, x0, x0; \
204 vpshufb RMASK, x1, x1; \ 224 vpshufb rmask, x1, x1; \
205 vpshufb RMASK, x2, x2; \ 225 vpshufb rmask, x2, x2; \
206 vpshufb RMASK, x3, x3; \ 226 vpshufb rmask, x3, x3; \
207 vmovdqu x0, (0*4*4)(out); \ 227 vmovdqu x0, (0*4*4)(out); \
208 vmovdqu x1, (1*4*4)(out); \ 228 vmovdqu x1, (1*4*4)(out); \
209 vmovdqu x2, (2*4*4)(out); \ 229 vmovdqu x2, (2*4*4)(out); \
210 vmovdqu x3, (3*4*4)(out); 230 vmovdqu x3, (3*4*4)(out);
211 231
212#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \ 232#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2, rmask) \
213 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \ 233 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
214 \ 234 \
215 vpshufb RMASK, x0, x0; \ 235 vpshufb rmask, x0, x0; \
216 vpshufb RMASK, x1, x1; \ 236 vpshufb rmask, x1, x1; \
217 vpshufb RMASK, x2, x2; \ 237 vpshufb rmask, x2, x2; \
218 vpshufb RMASK, x3, x3; \ 238 vpshufb rmask, x3, x3; \
219 vpxor (0*4*4)(out), x0, x0; \ 239 vpxor (0*4*4)(out), x0, x0; \
220 vmovdqu x0, (0*4*4)(out); \ 240 vmovdqu x0, (0*4*4)(out); \
221 vpxor (1*4*4)(out), x1, x1; \ 241 vpxor (1*4*4)(out), x1, x1; \
@@ -225,11 +245,29 @@
225 vpxor (3*4*4)(out), x3, x3; \ 245 vpxor (3*4*4)(out), x3, x3; \
226 vmovdqu x3, (3*4*4)(out); 246 vmovdqu x3, (3*4*4)(out);
227 247
248.data
249
228.align 16 250.align 16
229.Lbswap_mask: 251.Lbswap_mask:
230 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 252 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
253.Lrkr_enc_Q_Q_QBAR_QBAR:
254 .byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
255.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
256 .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
257.Lrkr_dec_Q_Q_Q_Q:
258 .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
259.Lrkr_dec_Q_Q_QBAR_QBAR:
260 .byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
261.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
262 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
263.L16_mask:
264 .byte 16, 16, 16, 16
231.L32_mask: 265.L32_mask:
232 .byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0 266 .byte 32, 0, 0, 0
267.Lfirst_mask:
268 .byte 0x1f, 0, 0, 0
269
270.text
233 271
234.align 16 272.align 16
235.global __cast6_enc_blk_8way 273.global __cast6_enc_blk_8way
@@ -243,28 +281,31 @@ __cast6_enc_blk_8way:
243 * %rcx: bool, if true: xor output 281 * %rcx: bool, if true: xor output
244 */ 282 */
245 283
284 pushq %rbp;
246 pushq %rbx; 285 pushq %rbx;
247 pushq %rcx; 286 pushq %rcx;
248 287
249 vmovdqu .Lbswap_mask, RMASK; 288 vmovdqa .Lbswap_mask, RKM;
250 vmovdqu .L32_mask, R32; 289 vmovd .Lfirst_mask, R1ST;
251 vpxor RKRF, RKRF, RKRF; 290 vmovd .L32_mask, R32;
252 291
253 leaq (4*4*4)(%rdx), %rax; 292 leaq (4*4*4)(%rdx), %rax;
254 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); 293 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
255 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); 294 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
256 295
257 xorq RID1, RID1; 296 movq %rsi, %r11;
258 xorq RID2, RID2;
259 297
298 preload_rkr(0, dummy, none);
260 Q(0); 299 Q(0);
261 Q(1); 300 Q(1);
262 Q(2); 301 Q(2);
263 Q(3); 302 Q(3);
303 preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
264 Q(4); 304 Q(4);
265 Q(5); 305 Q(5);
266 QBAR(6); 306 QBAR(6);
267 QBAR(7); 307 QBAR(7);
308 preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
268 QBAR(8); 309 QBAR(8);
269 QBAR(9); 310 QBAR(9);
270 QBAR(10); 311 QBAR(10);
@@ -272,20 +313,22 @@ __cast6_enc_blk_8way:
272 313
273 popq %rcx; 314 popq %rcx;
274 popq %rbx; 315 popq %rbx;
316 popq %rbp;
275 317
276 leaq (4*4*4)(%rsi), %rax; 318 vmovdqa .Lbswap_mask, RKM;
319 leaq (4*4*4)(%r11), %rax;
277 320
278 testb %cl, %cl; 321 testb %cl, %cl;
279 jnz __enc_xor8; 322 jnz __enc_xor8;
280 323
281 outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); 324 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
282 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); 325 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
283 326
284 ret; 327 ret;
285 328
286__enc_xor8: 329__enc_xor8:
287 outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); 330 outunpack_xor_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
288 outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); 331 outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
289 332
290 ret; 333 ret;
291 334
@@ -300,36 +343,41 @@ cast6_dec_blk_8way:
300 * %rdx: src 343 * %rdx: src
301 */ 344 */
302 345
346 pushq %rbp;
303 pushq %rbx; 347 pushq %rbx;
304 348
305 vmovdqu .Lbswap_mask, RMASK; 349 vmovdqa .Lbswap_mask, RKM;
306 vmovdqu .L32_mask, R32; 350 vmovd .Lfirst_mask, R1ST;
307 vpxor RKRF, RKRF, RKRF; 351 vmovd .L32_mask, R32;
308 352
309 leaq (4*4*4)(%rdx), %rax; 353 leaq (4*4*4)(%rdx), %rax;
310 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM); 354 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
311 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); 355 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
312 356
313 xorq RID1, RID1; 357 movq %rsi, %r11;
314 xorq RID2, RID2;
315 358
359 preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
316 Q(11); 360 Q(11);
317 Q(10); 361 Q(10);
318 Q(9); 362 Q(9);
319 Q(8); 363 Q(8);
364 preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
320 Q(7); 365 Q(7);
321 Q(6); 366 Q(6);
322 QBAR(5); 367 QBAR(5);
323 QBAR(4); 368 QBAR(4);
369 preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
324 QBAR(3); 370 QBAR(3);
325 QBAR(2); 371 QBAR(2);
326 QBAR(1); 372 QBAR(1);
327 QBAR(0); 373 QBAR(0);
328 374
329 popq %rbx; 375 popq %rbx;
376 popq %rbp;
330 377
331 leaq (4*4*4)(%rsi), %rax; 378 vmovdqa .Lbswap_mask, RKM;
332 outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM); 379 leaq (4*4*4)(%r11), %rax;
333 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM); 380 outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
381 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);
334 382
335 ret; 383 ret;
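The third change, merging the constant 16-bit rotation into the key-dependent rotation, leans on the fact that CAST-256 rotation counts are 5-bit values: adding 16 modulo 32 only flips bit 4, so it can be done with the single vpxor against .L16_mask in preload_rkr above (the "add 16-bit rotation to key rotations (mod 32)" comment). A quick check of that identity with two arbitrary rotation counts:

	kr = 20:  (20 + 16) mod 32 = 4     20 xor 16 = 0b10100 ^ 0b10000 = 0b00100 = 4
	kr =  5:  ( 5 + 16) mod 32 = 21     5 xor 16 = 0b00101 ^ 0b10000 = 0b10101 = 21

The same rework replaces the old per-round vpinsrb load of a rotation byte with preload_rkr, which fetches sixteen rotation bytes at once and lets get_round_keys peel one byte per round with vpand/vpsrldq. The extra rotation by 16 also appears to be what allows F_head to drop the per-invocation vpshufb byte shuffle that the old F macro needed before its table lookups.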