author     Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>  2012-07-11 13:38:57 -0400
committer  Herbert Xu <herbert@gondor.apana.org.au>  2012-08-01 05:47:30 -0400
commit     4ea1277d301eb776e321684cd4ea95116b4e8847
tree       675ef40d239946bc3232861cdf5a84259da09dc6
parent     9b8b04051d0df1e2c7c31206caff05673a2c685f
crypto: cast6 - add x86_64/avx assembler implementation
This patch adds an x86_64/avx assembler implementation of the Cast6 block
cipher. The implementation processes eight blocks in parallel (two 4-block
chunk AVX operations). The table-lookups are done in general-purpose
registers. For small block sizes the functions from the generic module are
called. A good performance increase is provided for block sizes greater
than or equal to 128B.

The patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast6-avx-x86_64 vs. cast6-generic
128bit key:                                         (lrw:256bit)    (xts:256bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   1.00x   1.01x   1.01x   0.99x   0.97x   0.98x   1.01x   0.96x   0.98x
64B     0.98x   0.99x   1.02x   1.01x   0.99x   1.00x   1.01x   0.99x   1.00x   0.99x
256B    1.77x   1.84x   0.99x   1.85x   1.77x   1.77x   1.70x   1.74x   1.69x   1.72x
1024B   1.93x   1.95x   0.99x   1.96x   1.93x   1.93x   1.84x   1.85x   1.89x   1.87x
8192B   1.91x   1.95x   0.99x   1.97x   1.95x   1.91x   1.86x   1.87x   1.93x   1.90x

256bit key:                                         (lrw:384bit)    (xts:512bit)
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec
16B     0.97x   0.99x   1.02x   1.01x   0.98x   0.99x   1.00x   1.00x   0.98x   0.98x
64B     0.98x   0.99x   1.01x   1.00x   1.00x   1.00x   1.01x   1.01x   0.97x   1.00x
256B    1.77x   1.83x   1.00x   1.86x   1.79x   1.78x   1.70x   1.76x   1.71x   1.69x
1024B   1.92x   1.95x   0.99x   1.96x   1.93x   1.93x   1.83x   1.86x   1.89x   1.87x
8192B   1.94x   1.95x   0.99x   1.97x   1.95x   1.95x   1.87x   1.87x   1.93x   1.91x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
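To illustrate the dispatch described above, here is a minimal C sketch; it is
not taken from this patch, and the helper names and the 8-block constant are
hypothetical stand-ins for the glue and generic code:

	/*
	 * Minimal sketch of the 8-blocks-at-a-time dispatch; all names
	 * here are hypothetical stand-ins for the real glue/generic code.
	 */
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	#define CAST6_BLOCK_SIZE 16
	#define CAST6_PARALLEL_BLOCKS 8

	void __cast6_enc_blk_8way(void *ctx, uint8_t *dst, const uint8_t *src,
				  bool xor_out);          /* this patch */
	void cast6_encrypt_one(void *ctx, uint8_t *dst,
			       const uint8_t *src);       /* generic module */

	static void cast6_ecb_encrypt_sketch(void *ctx, uint8_t *dst,
					     const uint8_t *src, size_t nbytes)
	{
		/* fast path: two 4-block AVX chunks per call */
		while (nbytes >= CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE) {
			__cast6_enc_blk_8way(ctx, dst, src, false);
			src += CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE;
			dst += CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE;
			nbytes -= CAST6_PARALLEL_BLOCKS * CAST6_BLOCK_SIZE;
		}
		/* small sizes: fall back to the generic one-block function */
		while (nbytes >= CAST6_BLOCK_SIZE) {
			cast6_encrypt_one(ctx, dst, src);
			src += CAST6_BLOCK_SIZE;
			dst += CAST6_BLOCK_SIZE;
			nbytes -= CAST6_BLOCK_SIZE;
		}
	}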
Diffstat (limited to 'arch/x86/crypto/cast6-avx-x86_64-asm_64.S')
-rw-r--r--  arch/x86/crypto/cast6-avx-x86_64-asm_64.S  335
1 file changed, 335 insertions, 0 deletions
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
new file mode 100644
index 00000000000..d258ce0d2e0
--- /dev/null
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -0,0 +1,335 @@
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

.file "cast6-avx-x86_64-asm_64.S"
.text

.extern cast6_s1
.extern cast6_s2
.extern cast6_s3
.extern cast6_s4

/* structure of crypto context */
#define km 0
#define kr (12*4*4)
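/*
 * The context is assumed to match the generic cast6 key schedule:
 * twelve quad-rounds x four 32-bit masking keys Km at offset 0
 * (hence kr = 12*4*4 = 192 bytes), followed by the rotation keys Kr
 * stored as one byte per subkey.
 */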

/* s-boxes */
#define s1 cast6_s1
#define s2 cast6_s2
#define s3 cast6_s3
#define s4 cast6_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %rdi

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKRF %xmm10
#define RKRR %xmm11

#define RTMP  %xmm12
#define RMASK %xmm13
#define R32   %xmm14

#define RID1  %rax
#define RID1b %al
#define RID2  %rbx
#define RID2b %bl

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


#define lookup_32bit(src, dst, op1, op2, op3) \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	shrq $16, src; \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;
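
/*
 * lookup_32bit consumes the low 32 bits of src two bytes at a time
 * (shifting src right by 16 halfway through) and folds the four
 * s-box entries into dst:
 *
 *	dst = ((s1[b0] op1 s2[b1]) op2 s3[b2]) op3 s4[b3]
 *
 * where b0..b3 are the four bytes of src's low dword.
 */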

#define F(a, x, op0, op1, op2, op3) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vpshufb RMASK, x, x; \
	vmovq x, RGI1; \
	vpsrldq $8, x, x; \
	vmovq x, RGI2; \
	\
	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
	shrq $16, RGI1; \
	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	\
	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
	shrq $16, RGI2; \
	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
	shlq $32, RFS3; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
#define F2(b, x) F(b, x, vpxor, subl, addl, xorl)
#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)
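
/*
 * F vectorizes the three CAST-256 round functions of RFC 2612 across
 * one 4-block chunk: I = rotl32(Km op0 b, Kr), built from the
 * vpslld/vpsrld/vpor triple (RKRR holds 32 - Kr), followed by the
 * per-dword s-box folding done by lookup_32bit:
 *
 *	F1: I = rotl32(Km + b, Kr); x = ((s1 ^ s2) - s3) + s4
 *	F2: I = rotl32(Km ^ b, Kr); x = ((s1 - s2) + s3) ^ s4
 *	F3: I = rotl32(Km - b, Kr); x = ((s1 + s2) ^ s3) - s4
 */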

#define qop(in, out, x, f) \
	F ## f(in ## 1, x); \
	vpxor out ## 1, x, out ## 1; \
	F ## f(in ## 2, x); \
	vpxor out ## 2, x, out ## 2; \

#define Q(n) \
	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RD, RC, RX, 1); \
	\
	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RC, RB, RX, 2); \
	\
	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RB, RA, RX, 3); \
	\
	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RA, RD, RX, 1);

#define QBAR(n) \
	vbroadcastss (km+(4*(4*n+3)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+3))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RA, RD, RX, 1); \
	\
	vbroadcastss (km+(4*(4*n+2)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+2))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RB, RA, RX, 3); \
	\
	vbroadcastss (km+(4*(4*n+1)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+1))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RC, RB, RX, 2); \
	\
	vbroadcastss (km+(4*(4*n+0)))(CTX), RKM; \
	vpinsrb $0, (kr+(4*n+0))(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	qop(RD, RC, RX, 1);
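
/*
 * Q(n) is quad-round n of RFC 2612 applied to both 4-block chunks,
 * broadcasting the masking key Km[n][i] into all lanes of RKM and
 * inserting the matching rotation byte Kr[n][i] into RKRF before
 * each step:
 *
 *	C ^= f1(D, Km[n][0], Kr[n][0]);
 *	B ^= f2(C, Km[n][1], Kr[n][1]);
 *	A ^= f3(B, Km[n][2], Kr[n][2]);
 *	D ^= f1(A, Km[n][3], Kr[n][3]);
 *
 * QBAR(n) is the inverse quad-round: the same four steps in reverse
 * order.
 */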


#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;
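
/*
 * transpose_4x4 treats x0..x3 as a 4x4 matrix of 32-bit words and
 * transposes it with two rounds of unpacks. On input each register
 * holds one 16-byte block; on output x0..x3 hold the first..fourth
 * word (A, B, C, D) of all four blocks, so each round step touches
 * four blocks with a single vector instruction.
 */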

#define inpack_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	vmovdqu (0*4*4)(in), x0; \
	vmovdqu (1*4*4)(in), x1; \
	vmovdqu (2*4*4)(in), x2; \
	vmovdqu (3*4*4)(in), x3; \
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	vmovdqu x0, (0*4*4)(out); \
	vmovdqu x1, (1*4*4)(out); \
	vmovdqu x2, (2*4*4)(out); \
	vmovdqu x3, (3*4*4)(out);

#define outunpack_xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpshufb RMASK, x2, x2; \
	vpshufb RMASK, x3, x3; \
	vpxor (0*4*4)(out), x0, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor (1*4*4)(out), x1, x1; \
	vmovdqu x1, (1*4*4)(out); \
	vpxor (2*4*4)(out), x2, x2; \
	vmovdqu x2, (2*4*4)(out); \
	vpxor (3*4*4)(out), x3, x3; \
	vmovdqu x3, (3*4*4)(out);
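
/*
 * CAST-256 operates on big-endian 32-bit words, so the vpshufb with
 * .Lbswap_mask byte-swaps every word on load (inpack_blocks) and
 * again on store (outunpack_blocks). outunpack_xor_blocks instead
 * xors the result into the data already at out, presumably so the
 * caller can build xor-based modes such as CTR on top of it.
 */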

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.L32_mask:
	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
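
/*
 * .Lbswap_mask is the vpshufb control for the per-word byte swap;
 * .L32_mask puts the constant 32 in R32's low quadword. Since RKRF
 * is zeroed before vpinsrb drops the Kr byte into lane 0, vpsubq
 * yields RKRR = 32 - Kr, the right-shift count that complements the
 * left shift by Kr in F's rotate.
 */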

.align 16
.global __cast6_enc_blk_8way
.type __cast6_enc_blk_8way,@function;

__cast6_enc_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbx;
	pushq %rcx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	xorq RID1, RID1;
	xorq RID2, RID2;

	Q(0);
	Q(1);
	Q(2);
	Q(3);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rcx;
	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;

	testb %cl, %cl;
	jnz __enc_xor8;

	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;

__enc_xor8:
	outunpack_xor_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_xor_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;

.align 16
.global cast6_dec_blk_8way
.type cast6_dec_blk_8way,@function;

cast6_dec_blk_8way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	leaq (4*4*4)(%rdx), %rax;
	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	xorq RID1, RID1;
	xorq RID2, RID2;

	Q(11);
	Q(10);
	Q(9);
	Q(8);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;

	leaq (4*4*4)(%rsi), %rax;
	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RTMP, RX, RKM);
	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RTMP, RX, RKM);

	ret;
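
From the register comments above and the x86_64 SysV calling convention
(%rdi, %rsi, %rdx, %rcx), the C-side declarations the glue code would use
look roughly like this; the struct tag and header choices are assumptions
for illustration, not part of this diff:

	/* Sketch of matching C prototypes; cast6_ctx is defined by the
	 * generic cast6 module, and these declarations are inferred from
	 * the input comments in the assembler above. */
	#include <linux/linkage.h>
	#include <linux/types.h>

	struct cast6_ctx;

	/* encrypt 8 blocks at dst from src; if xor is true, xor the
	 * ciphertext into the bytes already at dst */
	asmlinkage void __cast6_enc_blk_8way(struct cast6_ctx *ctx, u8 *dst,
					     const u8 *src, bool xor);

	/* decrypt 8 blocks */
	asmlinkage void cast6_dec_blk_8way(struct cast6_ctx *ctx, u8 *dst,
					   const u8 *src);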