aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
diff options
context:
space:
mode:
authorJohannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>2012-05-28 09:54:24 -0400
committerHerbert Xu <herbert@gondor.apana.org.au>2012-06-12 04:46:07 -0400
commit107778b592576c0c8e8d2ca7a2aa5415a4908223 (patch)
tree0e07f6abd2acaf69bf25efacf520584d748c860b /arch/x86/crypto/twofish-avx-x86_64-asm_64.S
parent4d03c5047a07a62563e1a8fa798ea258f048bfde (diff)
crypto: twofish - add x86_64/avx assembler implementation
This patch adds a x86_64/avx assembler implementation of the Twofish block cipher. The implementation processes eight blocks in parallel (two 4 block chunk AVX operations). The table-lookups are done in general-purpose registers. For small blocksizes the 3way-parallel functions from the twofish-x86_64-3way module are called. A good performance increase is provided for blocksizes greater or equal to 128B. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmark results: Intel Core i5-2500 CPU (fam:6, model:42, step:7) twofish-avx-x86_64 vs. twofish-x86_64-3way 128bit key: (lrw:256bit) (xts:256bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.97x 1.00x 0.95x 0.97x 0.97x 0.96x 0.95x 0.95x 0.98x 64B 0.99x 0.99x 1.00x 0.99x 0.98x 0.98x 0.99x 0.98x 0.99x 0.98x 256B 1.20x 1.21x 1.00x 1.19x 1.15x 1.14x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.24x 1.26x 1.28x 1.26x 1.27x 8192B 1.31x 1.32x 1.00x 1.31x 1.25x 1.25x 1.28x 1.29x 1.28x 1.30x 256bit key: (lrw:384bit) (xts:512bit) size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec lrw-enc lrw-dec xts-enc xts-dec 16B 0.96x 0.96x 1.00x 0.96x 0.97x 0.98x 0.95x 0.95x 0.95x 0.96x 64B 1.00x 0.99x 1.00x 0.98x 0.98x 1.01x 0.98x 0.98x 0.98x 0.98x 256B 1.20x 1.21x 1.00x 1.21x 1.15x 1.15x 1.19x 1.20x 1.18x 1.19x 1024B 1.29x 1.30x 1.00x 1.28x 1.23x 1.23x 1.26x 1.27x 1.26x 1.27x 8192B 1.31x 1.33x 1.00x 1.31x 1.26x 1.26x 1.29x 1.29x 1.28x 1.30x twofish-avx-x86_64 vs aes-asm (8kB block): 128bit 256bit ecb-enc 1.19x 1.63x ecb-dec 1.18x 1.62x cbc-enc 0.75x 1.03x cbc-dec 1.23x 1.67x ctr-enc 1.24x 1.65x ctr-dec 1.24x 1.65x lrw-enc 1.15x 1.53x lrw-dec 1.14x 1.52x xts-enc 1.16x 1.56x xts-dec 1.16x 1.56x Signed-off-by: Johannes Goetzfried <Johannes.Goetzfried@informatik.stud.uni-erlangen.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/twofish-avx-x86_64-asm_64.S')
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S301
1 files changed, 301 insertions, 0 deletions
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
new file mode 100644
index 000000000000..fc31b89ba4c3
--- /dev/null
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -0,0 +1,301 @@
1/*
2 * Twofish Cipher 8-way parallel algorithm (AVX/x86_64)
3 *
4 * Copyright (C) 2012 Johannes Goetzfried
5 * <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
20 * USA
21 *
22 */
23
24.file "twofish-avx-x86_64-asm_64.S"
25.text
26
27/* structure of crypto context */
28#define s0 0
29#define s1 1024
30#define s2 2048
31#define s3 3072
32#define w 4096
33#define k 4128
34
35/**********************************************************************
36 8-way AVX twofish
37 **********************************************************************/
38#define CTX %rdi
39
40#define RA1 %xmm0
41#define RB1 %xmm1
42#define RC1 %xmm2
43#define RD1 %xmm3
44
45#define RA2 %xmm4
46#define RB2 %xmm5
47#define RC2 %xmm6
48#define RD2 %xmm7
49
50#define RX %xmm8
51#define RY %xmm9
52
53#define RK1 %xmm10
54#define RK2 %xmm11
55
56#define RID1 %rax
57#define RID1b %al
58#define RID2 %rbx
59#define RID2b %bl
60
61#define RGI1 %rdx
62#define RGI1bl %dl
63#define RGI1bh %dh
64#define RGI2 %rcx
65#define RGI2bl %cl
66#define RGI2bh %ch
67
68#define RGS1 %r8
69#define RGS1d %r8d
70#define RGS2 %r9
71#define RGS2d %r9d
72#define RGS3 %r10
73#define RGS3d %r10d
74
75
76#define lookup_32bit(t0, t1, t2, t3, src, dst) \
77 movb src ## bl, RID1b; \
78 movb src ## bh, RID2b; \
79 movl t0(CTX, RID1, 4), dst ## d; \
80 xorl t1(CTX, RID2, 4), dst ## d; \
81 shrq $16, src; \
82 movb src ## bl, RID1b; \
83 movb src ## bh, RID2b; \
84 xorl t2(CTX, RID1, 4), dst ## d; \
85 xorl t3(CTX, RID2, 4), dst ## d;
86
87#define G(a, x, t0, t1, t2, t3) \
88 vmovq a, RGI1; \
89 vpsrldq $8, a, x; \
90 vmovq x, RGI2; \
91 \
92 lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
93 shrq $16, RGI1; \
94 lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
95 shlq $32, RGS2; \
96 orq RGS1, RGS2; \
97 \
98 lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
99 shrq $16, RGI2; \
100 lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
101 shlq $32, RGS3; \
102 orq RGS1, RGS3; \
103 \
104 vmovq RGS2, x; \
105 vpinsrq $1, RGS3, x, x;
106
107#define encround(a, b, c, d, x, y) \
108 G(a, x, s0, s1, s2, s3); \
109 G(b, y, s1, s2, s3, s0); \
110 vpaddd x, y, x; \
111 vpaddd y, x, y; \
112 vpaddd x, RK1, x; \
113 vpaddd y, RK2, y; \
114 vpxor x, c, c; \
115 vpsrld $1, c, x; \
116 vpslld $(32 - 1), c, c; \
117 vpor c, x, c; \
118 vpslld $1, d, x; \
119 vpsrld $(32 - 1), d, d; \
120 vpor d, x, d; \
121 vpxor d, y, d;
122
123#define decround(a, b, c, d, x, y) \
124 G(a, x, s0, s1, s2, s3); \
125 G(b, y, s1, s2, s3, s0); \
126 vpaddd x, y, x; \
127 vpaddd y, x, y; \
128 vpaddd y, RK2, y; \
129 vpxor d, y, d; \
130 vpsrld $1, d, y; \
131 vpslld $(32 - 1), d, d; \
132 vpor d, y, d; \
133 vpslld $1, c, y; \
134 vpsrld $(32 - 1), c, c; \
135 vpor c, y, c; \
136 vpaddd x, RK1, x; \
137 vpxor x, c, c;
138
139#define encrypt_round(n, a, b, c, d) \
140 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
141 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
142 encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
143 encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
144
145#define decrypt_round(n, a, b, c, d) \
146 vbroadcastss (k+4*(2*(n)))(CTX), RK1; \
147 vbroadcastss (k+4*(2*(n)+1))(CTX), RK2; \
148 decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
149 decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
150
151#define encrypt_cycle(n) \
152 encrypt_round((2*n), RA, RB, RC, RD); \
153 encrypt_round(((2*n) + 1), RC, RD, RA, RB);
154
155#define decrypt_cycle(n) \
156 decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
157 decrypt_round((2*n), RA, RB, RC, RD);
158
159
160#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
161 vpunpckldq x1, x0, t0; \
162 vpunpckhdq x1, x0, t2; \
163 vpunpckldq x3, x2, t1; \
164 vpunpckhdq x3, x2, x3; \
165 \
166 vpunpcklqdq t1, t0, x0; \
167 vpunpckhqdq t1, t0, x1; \
168 vpunpcklqdq x3, t2, x2; \
169 vpunpckhqdq x3, t2, x3;
170
171#define inpack_blocks(in, x0, x1, x2, x3, wkey, t0, t1, t2) \
172 vpxor (0*4*4)(in), wkey, x0; \
173 vpxor (1*4*4)(in), wkey, x1; \
174 vpxor (2*4*4)(in), wkey, x2; \
175 vpxor (3*4*4)(in), wkey, x3; \
176 \
177 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
178
179#define outunpack_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
180 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
181 \
182 vpxor x0, wkey, x0; \
183 vmovdqu x0, (0*4*4)(out); \
184 vpxor x1, wkey, x1; \
185 vmovdqu x1, (1*4*4)(out); \
186 vpxor x2, wkey, x2; \
187 vmovdqu x2, (2*4*4)(out); \
188 vpxor x3, wkey, x3; \
189 vmovdqu x3, (3*4*4)(out);
190
191#define outunpack_xor_blocks(out, x0, x1, x2, x3, wkey, t0, t1, t2) \
192 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
193 \
194 vpxor x0, wkey, x0; \
195 vpxor (0*4*4)(out), x0, x0; \
196 vmovdqu x0, (0*4*4)(out); \
197 vpxor x1, wkey, x1; \
198 vpxor (1*4*4)(out), x1, x1; \
199 vmovdqu x1, (1*4*4)(out); \
200 vpxor x2, wkey, x2; \
201 vpxor (2*4*4)(out), x2, x2; \
202 vmovdqu x2, (2*4*4)(out); \
203 vpxor x3, wkey, x3; \
204 vpxor (3*4*4)(out), x3, x3; \
205 vmovdqu x3, (3*4*4)(out);
206
207.align 8
208.global __twofish_enc_blk_8way
209.type __twofish_enc_blk_8way,@function;
210
211__twofish_enc_blk_8way:
212 /* input:
213 * %rdi: ctx, CTX
214 * %rsi: dst
215 * %rdx: src
216 * %rcx: bool, if true: xor output
217 */
218
219 pushq %rbx;
220 pushq %rcx;
221
222 vmovdqu w(CTX), RK1;
223
224 leaq (4*4*4)(%rdx), %rax;
225 inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
226 inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
227
228 xorq RID1, RID1;
229 xorq RID2, RID2;
230
231 encrypt_cycle(0);
232 encrypt_cycle(1);
233 encrypt_cycle(2);
234 encrypt_cycle(3);
235 encrypt_cycle(4);
236 encrypt_cycle(5);
237 encrypt_cycle(6);
238 encrypt_cycle(7);
239
240 vmovdqu (w+4*4)(CTX), RK1;
241
242 popq %rcx;
243 popq %rbx;
244
245 leaq (4*4*4)(%rsi), %rax;
246 leaq (4*4*4)(%rax), %rdx;
247
248 testb %cl, %cl;
249 jnz __enc_xor8;
250
251 outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
252 outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
253
254 ret;
255
256__enc_xor8:
257 outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
258 outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
259
260 ret;
261
262.align 8
263.global twofish_dec_blk_8way
264.type twofish_dec_blk_8way,@function;
265
266twofish_dec_blk_8way:
267 /* input:
268 * %rdi: ctx, CTX
269 * %rsi: dst
270 * %rdx: src
271 */
272
273 pushq %rbx;
274
275 vmovdqu (w+4*4)(CTX), RK1;
276
277 leaq (4*4*4)(%rdx), %rax;
278 inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
279 inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
280
281 xorq RID1, RID1;
282 xorq RID2, RID2;
283
284 decrypt_cycle(7);
285 decrypt_cycle(6);
286 decrypt_cycle(5);
287 decrypt_cycle(4);
288 decrypt_cycle(3);
289 decrypt_cycle(2);
290 decrypt_cycle(1);
291 decrypt_cycle(0);
292
293 vmovdqu (w)(CTX), RK1;
294
295 popq %rbx;
296
297 leaq (4*4*4)(%rsi), %rax;
298 outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
299 outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
300
301 ret;