author    Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>    2012-07-11 13:37:37 -0400
committer Herbert Xu <herbert@gondor.apana.org.au>    2012-08-01 05:47:30 -0400
commit    4d6d6a2c850f89bc9283d02519cb536baba72032 (patch)
tree      8433747260d88000d79849bcd4db0e56b86aa6e4 /arch/x86/crypto/cast5-avx-x86_64-asm_64.S
parent    a2c5826095562983bf316e3a7eb137ef04a71a24 (diff)
crypto: cast5 - add x86_64/avx assembler implementation
This patch adds an x86_64/AVX assembler implementation of the Cast5 block cipher. The implementation processes sixteen blocks in parallel (four 4-block chunks of AVX operations). The table lookups are done in general-purpose registers. For small block sizes the functions from the generic module are called. A good performance increase is provided for block sizes greater than or equal to 128B.

The patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmark results:

Intel Core i5-2500 CPU (fam:6, model:42, step:7)

cast5-avx-x86_64 vs. cast5-generic

64bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.02x   1.01x
64B     1.00x   1.00x   0.98x   1.00x   1.01x   1.02x
256B    2.03x   2.01x   0.95x   2.11x   2.12x   2.13x
1024B   2.30x   2.24x   0.95x   2.29x   2.35x   2.35x
8192B   2.31x   2.27x   0.95x   2.31x   2.39x   2.39x

128bit key:
size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.99x   0.99x   1.00x   1.00x   1.01x   1.01x
64B     1.00x   1.00x   0.98x   1.01x   1.02x   1.01x
256B    2.17x   2.13x   0.96x   2.19x   2.19x   2.19x
1024B   2.29x   2.32x   0.95x   2.34x   2.37x   2.38x
8192B   2.35x   2.32x   0.95x   2.35x   2.39x   2.39x

Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
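[Editor's note] The fallback to the generic module described above lives in the C glue code, not in the assembler file added below. As a rough, non-authoritative sketch of how such a dispatch can look (CAST5_PARALLEL_BLOCKS, cast5_encrypt_one() and the helper function are illustrative assumptions; only __cast5_enc_blk_16way is defined by this patch):

#include <stdbool.h>

typedef unsigned char u8;          /* stand-in for <linux/types.h> in this sketch */

#define CAST5_BLOCK_SIZE      8    /* Cast5 is a 64-bit block cipher */
#define CAST5_PARALLEL_BLOCKS 16   /* the AVX path handles 16 blocks per call */

struct cast5_ctx;                  /* key schedule: km, kr and the rr flag */

/* 16-way AVX entry point provided by cast5-avx-x86_64-asm_64.S */
void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
                           bool xor_output);

/* single-block routine from the generic cast5 module (name assumed here) */
void cast5_encrypt_one(struct cast5_ctx *ctx, u8 *dst, const u8 *src);

/* ECB-style walk: full 16-block chunks go to the AVX code, the remainder
 * falls back to the generic single-block implementation. */
static void cast5_ecb_encrypt_sketch(struct cast5_ctx *ctx, u8 *dst,
                                     const u8 *src, unsigned int nbytes)
{
        while (nbytes >= CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE) {
                __cast5_enc_blk_16way(ctx, dst, src, false);
                src    += CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE;
                dst    += CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE;
                nbytes -= CAST5_PARALLEL_BLOCKS * CAST5_BLOCK_SIZE;
        }

        while (nbytes >= CAST5_BLOCK_SIZE) {
                cast5_encrypt_one(ctx, dst, src);
                src    += CAST5_BLOCK_SIZE;
                dst    += CAST5_BLOCK_SIZE;
                nbytes -= CAST5_BLOCK_SIZE;
        }
}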
Diffstat (limited to 'arch/x86/crypto/cast5-avx-x86_64-asm_64.S')
-rw-r--r--    arch/x86/crypto/cast5-avx-x86_64-asm_64.S    322
1 file changed, 322 insertions, 0 deletions
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..94693c877e3b
--- /dev/null
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -0,0 +1,322 @@
/*
 * Cast5 Cipher 16-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

.file "cast5-avx-x86_64-asm_64.S"
.text

.extern cast5_s1
.extern cast5_s2
.extern cast5_s3
.extern cast5_s4

/* structure of crypto context */
#define km	0
#define kr	(16*4)
#define rr	((16*4)+16)
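/*
 * Note: the offsets above are assumed to mirror the key-schedule layout of
 * the generic cast5 module: sixteen 32-bit masking keys (km) at offset 0,
 * sixteen rotation keys (kr) at offset 16*4, and the reduced-rounds flag
 * (rr) at offset (16*4)+16.  rr is non-zero when only 12 of the 16 rounds
 * are used; the structure itself is defined outside this file.
 */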

/* s-boxes */
#define s1	cast5_s1
#define s2	cast5_s2
#define s3	cast5_s3
#define s4	cast5_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
#define CTX %rdi

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKRF %xmm10
#define RKRR %xmm11

#define RTMP  %xmm12
#define RMASK %xmm13
#define R32   %xmm14

#define RID1  %rax
#define RID1b %al
#define RID2  %rbx
#define RID2b %bl

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


#define lookup_32bit(src, dst, op1, op2, op3) \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	movl s1(, RID1, 4), dst ## d; \
	op1  s2(, RID2, 4), dst ## d; \
	shrq $16, src; \
	movb src ## bl, RID1b; \
	movb src ## bh, RID2b; \
	op2  s3(, RID1, 4), dst ## d; \
	op3  s4(, RID2, 4), dst ## d;

#define F(a, x, op0, op1, op2, op3) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vpshufb RMASK, x, x; \
	vmovq x, RGI1; \
	vpsrldq $8, x, x; \
	vmovq x, RGI2; \
	\
	lookup_32bit(RGI1, RFS1, op1, op2, op3); \
	shrq $16, RGI1; \
	lookup_32bit(RGI1, RFS2, op1, op2, op3); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	\
	lookup_32bit(RGI2, RFS1, op1, op2, op3); \
	shrq $16, RGI2; \
	lookup_32bit(RGI2, RFS3, op1, op2, op3); \
	shlq $32, RFS3; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

#define F1(b, x) F(b, x, vpaddd, xorl, subl, addl)
#define F2(b, x) F(b, x, vpxor,  subl, addl, xorl)
#define F3(b, x) F(b, x, vpsubd, addl, xorl, subl)

#define subround(a, b, x, n, f) \
	F ## f(b, x); \
	vpxor a, x, a;

#define round(l, r, n, f) \
	vbroadcastss (km+(4*n))(CTX), RKM; \
	vpinsrb $0, (kr+n)(CTX), RKRF, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	subround(l ## 1, r ## 1, RX, n, f); \
	subround(l ## 2, r ## 2, RX, n, f); \
	subround(l ## 3, r ## 3, RX, n, f); \
	subround(l ## 4, r ## 4, RX, n, f);


#define transpose_2x4(x0, x1, t0, t1) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t1; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1;

#define inpack_blocks(in, x0, x1, t0, t1) \
	vmovdqu (0*4*4)(in), x0; \
	vmovdqu (1*4*4)(in), x1; \
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	\
	transpose_2x4(x0, x1, t0, t1)

#define outunpack_blocks(out, x0, x1, t0, t1) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vmovdqu x0, (0*4*4)(out); \
	vmovdqu x1, (1*4*4)(out);

#define outunpack_xor_blocks(out, x0, x1, t0, t1) \
	transpose_2x4(x0, x1, t0, t1) \
	\
	vpshufb RMASK, x0, x0; \
	vpshufb RMASK, x1, x1; \
	vpxor (0*4*4)(out), x0, x0; \
	vmovdqu x0, (0*4*4)(out); \
	vpxor (1*4*4)(out), x1, x1; \
	vmovdqu x1, (1*4*4)(out);

.align 16
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.L32_mask:
	.byte 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

.align 16
.global __cast5_enc_blk_16way
.type __cast5_enc_blk_16way,@function;

__cast5_enc_blk_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 */

	pushq %rbx;
	pushq %rcx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
	leaq (2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL4, RR4, RTMP, RX);

	xorq RID1, RID1;
	xorq RID2, RID2;

	round(RL, RR, 0, 1);
	round(RR, RL, 1, 2);
	round(RL, RR, 2, 3);
	round(RR, RL, 3, 1);
	round(RL, RR, 4, 2);
	round(RR, RL, 5, 3);
	round(RL, RR, 6, 1);
	round(RR, RL, 7, 2);
	round(RL, RR, 8, 3);
	round(RR, RL, 9, 1);
	round(RL, RR, 10, 2);
	round(RR, RL, 11, 3);

	movb rr(CTX), %al;
	testb %al, %al;
	jnz __skip_enc;

	round(RL, RR, 12, 1);
	round(RR, RL, 13, 2);
	round(RL, RR, 14, 3);
	round(RR, RL, 15, 1);

__skip_enc:
	popq %rcx;
	popq %rbx;

	testb %cl, %cl;
	jnz __enc_xor16;

	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
	leaq (2*4*4)(%rsi), %rax;
	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);

	ret;

__enc_xor16:
	outunpack_xor_blocks(%rsi, RR1, RL1, RTMP, RX);
	leaq (2*4*4)(%rsi), %rax;
	outunpack_xor_blocks(%rax, RR2, RL2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_xor_blocks(%rax, RR3, RL3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_xor_blocks(%rax, RR4, RL4, RTMP, RX);

	ret;

.align 16
.global cast5_dec_blk_16way
.type cast5_dec_blk_16way,@function;

cast5_dec_blk_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	pushq %rbx;

	vmovdqu .Lbswap_mask, RMASK;
	vmovdqu .L32_mask, R32;
	vpxor RKRF, RKRF, RKRF;

	inpack_blocks(%rdx, RL1, RR1, RTMP, RX);
	leaq (2*4*4)(%rdx), %rax;
	inpack_blocks(%rax, RL2, RR2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL3, RR3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	inpack_blocks(%rax, RL4, RR4, RTMP, RX);

	xorq RID1, RID1;
	xorq RID2, RID2;

	movb rr(CTX), %al;
	testb %al, %al;
	jnz __skip_dec;

	round(RL, RR, 15, 1);
	round(RR, RL, 14, 3);
	round(RL, RR, 13, 2);
	round(RR, RL, 12, 1);

__skip_dec:
	round(RL, RR, 11, 3);
	round(RR, RL, 10, 2);
	round(RL, RR, 9, 1);
	round(RR, RL, 8, 3);
	round(RL, RR, 7, 2);
	round(RR, RL, 6, 1);
	round(RL, RR, 5, 3);
	round(RR, RL, 4, 2);
	round(RL, RR, 3, 1);
	round(RR, RL, 2, 3);
	round(RL, RR, 1, 2);
	round(RR, RL, 0, 1);

	popq %rbx;

	outunpack_blocks(%rsi, RR1, RL1, RTMP, RX);
	leaq (2*4*4)(%rsi), %rax;
	outunpack_blocks(%rax, RR2, RL2, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR3, RL3, RTMP, RX);
	leaq (2*4*4)(%rax), %rax;
	outunpack_blocks(%rax, RR4, RL4, RTMP, RX);

	ret;
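
[Editor's note] For reference, the two entry points exported by this file would be declared on the C side roughly as below. Only the symbol names and the register comments come from the file; the parameter types and the struct name cast5_ctx are assumptions for illustration. The final flag of the encryption routine selects the __enc_xor16 path, which xors the result into the destination buffer instead of overwriting it.

#include <stdbool.h>

typedef unsigned char u8;   /* stand-in for the kernel's u8 in this sketch */

struct cast5_ctx;           /* key schedule: km, kr and the rr flag (see offsets above) */

/* encrypts 16 blocks; if xor_output is true, dst ^= result instead of dst = result */
void __cast5_enc_blk_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src,
			   bool xor_output);

/* decrypts 16 blocks; no xor variant is provided */
void cast5_dec_blk_16way(struct cast5_ctx *ctx, u8 *dst, const u8 *src);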