aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@mbnet.fi>2011-11-09 09:26:31 -0500
committerHerbert Xu <herbert@gondor.apana.org.au>2011-11-21 03:13:23 -0500
commit251496dbfc1be38bc43b49651f3d33c02faccc47 (patch)
treee17a6704b90b94d0da126eba603fe20cb7ca822c /arch/x86/crypto
parent937c30d7f560210b0163035edd42b2aef78fed9e (diff)
crypto: serpent - add 4-way parallel i586/SSE2 assembler implementation
Patch adds i586/SSE2 assembler implementation of serpent cipher. Assembler functions crypt data in four block chunks. Patch has been tested with tcrypt and automated filesystem tests. Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios): Intel Atom N270: size ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec 16 0.95x 1.12x 1.02x 1.07x 0.97x 0.98x 64 1.73x 1.82x 1.08x 1.82x 1.72x 1.73x 256 2.08x 2.00x 1.04x 2.07x 1.99x 2.01x 1024 2.28x 2.18x 1.05x 2.23x 2.17x 2.20x 8192 2.28x 2.13x 1.05x 2.23x 2.18x 2.20x Full output: http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-generic.txt http://koti.mbnet.fi/axh/kernel/crypto/atom-n270/serpent-sse2.txt Userspace test results: Encryption/decryption of sse2-i586 vs generic on Intel Atom N270: encrypt: 2.35x decrypt: 2.54x Encryption/decryption of sse2-i586 vs generic on AMD Phenom II: encrypt: 1.82x decrypt: 2.51x Encryption/decryption of sse2-i586 vs generic on Intel Xeon E7330: encrypt: 2.99x decrypt: 3.48x Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto')
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/serpent-sse2-i586-asm_32.S638
2 files changed, 640 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 12ebdbd80ccb..2b0b9631474b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -5,6 +5,7 @@
5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
8obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
8 9
9obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o 10obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o 11obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
@@ -21,6 +22,7 @@ obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o
21aes-i586-y := aes-i586-asm_32.o aes_glue.o 22aes-i586-y := aes-i586-asm_32.o aes_glue.o
22twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 23twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
23salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 24salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
25serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
24 26
25aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o 27aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
26blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o 28blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
diff --git a/arch/x86/crypto/serpent-sse2-i586-asm_32.S b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
new file mode 100644
index 000000000000..4e37677ca851
--- /dev/null
+++ b/arch/x86/crypto/serpent-sse2-i586-asm_32.S
@@ -0,0 +1,638 @@
1/*
2 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * Based on crypto/serpent.c by
7 * Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
8 * 2003 Herbert Valerio Riedel <hvr@gnu.org>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
23 * USA
24 *
25 */
26
27.file "serpent-sse2-i586-asm_32.S"
28.text
29
30#define arg_ctx 4
31#define arg_dst 8
32#define arg_src 12
33#define arg_xor 16
34
35/**********************************************************************
36 4-way SSE2 serpent
37 **********************************************************************/
38#define CTX %edx
39
40#define RA %xmm0
41#define RB %xmm1
42#define RC %xmm2
43#define RD %xmm3
44#define RE %xmm4
45
46#define RT0 %xmm5
47#define RT1 %xmm6
48
49#define RNOT %xmm7
50
51#define get_key(i, j, t) \
52 movd (4*(i)+(j))*4(CTX), t; \
53 pshufd $0, t, t;
54
55#define K(x0, x1, x2, x3, x4, i) \
56 get_key(i, 0, x4); \
57 get_key(i, 1, RT0); \
58 get_key(i, 2, RT1); \
59 pxor x4, x0; \
60 pxor RT0, x1; \
61 pxor RT1, x2; \
62 get_key(i, 3, x4); \
63 pxor x4, x3;
64
65#define LK(x0, x1, x2, x3, x4, i) \
66 movdqa x0, x4; \
67 pslld $13, x0; \
68 psrld $(32 - 13), x4; \
69 por x4, x0; \
70 pxor x0, x1; \
71 movdqa x2, x4; \
72 pslld $3, x2; \
73 psrld $(32 - 3), x4; \
74 por x4, x2; \
75 pxor x2, x1; \
76 movdqa x1, x4; \
77 pslld $1, x1; \
78 psrld $(32 - 1), x4; \
79 por x4, x1; \
80 movdqa x0, x4; \
81 pslld $3, x4; \
82 pxor x2, x3; \
83 pxor x4, x3; \
84 movdqa x3, x4; \
85 pslld $7, x3; \
86 psrld $(32 - 7), x4; \
87 por x4, x3; \
88 movdqa x1, x4; \
89 pslld $7, x4; \
90 pxor x1, x0; \
91 pxor x3, x0; \
92 pxor x3, x2; \
93 pxor x4, x2; \
94 movdqa x0, x4; \
95 get_key(i, 1, RT0); \
96 pxor RT0, x1; \
97 get_key(i, 3, RT0); \
98 pxor RT0, x3; \
99 pslld $5, x0; \
100 psrld $(32 - 5), x4; \
101 por x4, x0; \
102 movdqa x2, x4; \
103 pslld $22, x2; \
104 psrld $(32 - 22), x4; \
105 por x4, x2; \
106 get_key(i, 0, RT0); \
107 pxor RT0, x0; \
108 get_key(i, 2, RT0); \
109 pxor RT0, x2;
110
111#define KL(x0, x1, x2, x3, x4, i) \
112 K(x0, x1, x2, x3, x4, i); \
113 movdqa x0, x4; \
114 psrld $5, x0; \
115 pslld $(32 - 5), x4; \
116 por x4, x0; \
117 movdqa x2, x4; \
118 psrld $22, x2; \
119 pslld $(32 - 22), x4; \
120 por x4, x2; \
121 pxor x3, x2; \
122 pxor x3, x0; \
123 movdqa x1, x4; \
124 pslld $7, x4; \
125 pxor x1, x0; \
126 pxor x4, x2; \
127 movdqa x1, x4; \
128 psrld $1, x1; \
129 pslld $(32 - 1), x4; \
130 por x4, x1; \
131 movdqa x3, x4; \
132 psrld $7, x3; \
133 pslld $(32 - 7), x4; \
134 por x4, x3; \
135 pxor x0, x1; \
136 movdqa x0, x4; \
137 pslld $3, x4; \
138 pxor x4, x3; \
139 movdqa x0, x4; \
140 psrld $13, x0; \
141 pslld $(32 - 13), x4; \
142 por x4, x0; \
143 pxor x2, x1; \
144 pxor x2, x3; \
145 movdqa x2, x4; \
146 psrld $3, x2; \
147 pslld $(32 - 3), x4; \
148 por x4, x2;
149
150#define S0(x0, x1, x2, x3, x4) \
151 movdqa x3, x4; \
152 por x0, x3; \
153 pxor x4, x0; \
154 pxor x2, x4; \
155 pxor RNOT, x4; \
156 pxor x1, x3; \
157 pand x0, x1; \
158 pxor x4, x1; \
159 pxor x0, x2; \
160 pxor x3, x0; \
161 por x0, x4; \
162 pxor x2, x0; \
163 pand x1, x2; \
164 pxor x2, x3; \
165 pxor RNOT, x1; \
166 pxor x4, x2; \
167 pxor x2, x1;
168
169#define S1(x0, x1, x2, x3, x4) \
170 movdqa x1, x4; \
171 pxor x0, x1; \
172 pxor x3, x0; \
173 pxor RNOT, x3; \
174 pand x1, x4; \
175 por x1, x0; \
176 pxor x2, x3; \
177 pxor x3, x0; \
178 pxor x3, x1; \
179 pxor x4, x3; \
180 por x4, x1; \
181 pxor x2, x4; \
182 pand x0, x2; \
183 pxor x1, x2; \
184 por x0, x1; \
185 pxor RNOT, x0; \
186 pxor x2, x0; \
187 pxor x1, x4;
188
189#define S2(x0, x1, x2, x3, x4) \
190 pxor RNOT, x3; \
191 pxor x0, x1; \
192 movdqa x0, x4; \
193 pand x2, x0; \
194 pxor x3, x0; \
195 por x4, x3; \
196 pxor x1, x2; \
197 pxor x1, x3; \
198 pand x0, x1; \
199 pxor x2, x0; \
200 pand x3, x2; \
201 por x1, x3; \
202 pxor RNOT, x0; \
203 pxor x0, x3; \
204 pxor x0, x4; \
205 pxor x2, x0; \
206 por x2, x1;
207
208#define S3(x0, x1, x2, x3, x4) \
209 movdqa x1, x4; \
210 pxor x3, x1; \
211 por x0, x3; \
212 pand x0, x4; \
213 pxor x2, x0; \
214 pxor x1, x2; \
215 pand x3, x1; \
216 pxor x3, x2; \
217 por x4, x0; \
218 pxor x3, x4; \
219 pxor x0, x1; \
220 pand x3, x0; \
221 pand x4, x3; \
222 pxor x2, x3; \
223 por x1, x4; \
224 pand x1, x2; \
225 pxor x3, x4; \
226 pxor x3, x0; \
227 pxor x2, x3;
228
229#define S4(x0, x1, x2, x3, x4) \
230 movdqa x3, x4; \
231 pand x0, x3; \
232 pxor x4, x0; \
233 pxor x2, x3; \
234 por x4, x2; \
235 pxor x1, x0; \
236 pxor x3, x4; \
237 por x0, x2; \
238 pxor x1, x2; \
239 pand x0, x1; \
240 pxor x4, x1; \
241 pand x2, x4; \
242 pxor x3, x2; \
243 pxor x0, x4; \
244 por x1, x3; \
245 pxor RNOT, x1; \
246 pxor x0, x3;
247
248#define S5(x0, x1, x2, x3, x4) \
249 movdqa x1, x4; \
250 por x0, x1; \
251 pxor x1, x2; \
252 pxor RNOT, x3; \
253 pxor x0, x4; \
254 pxor x2, x0; \
255 pand x4, x1; \
256 por x3, x4; \
257 pxor x0, x4; \
258 pand x3, x0; \
259 pxor x3, x1; \
260 pxor x2, x3; \
261 pxor x1, x0; \
262 pand x4, x2; \
263 pxor x2, x1; \
264 pand x0, x2; \
265 pxor x2, x3;
266
267#define S6(x0, x1, x2, x3, x4) \
268 movdqa x1, x4; \
269 pxor x0, x3; \
270 pxor x2, x1; \
271 pxor x0, x2; \
272 pand x3, x0; \
273 por x3, x1; \
274 pxor RNOT, x4; \
275 pxor x1, x0; \
276 pxor x2, x1; \
277 pxor x4, x3; \
278 pxor x0, x4; \
279 pand x0, x2; \
280 pxor x1, x4; \
281 pxor x3, x2; \
282 pand x1, x3; \
283 pxor x0, x3; \
284 pxor x2, x1;
285
286#define S7(x0, x1, x2, x3, x4) \
287 pxor RNOT, x1; \
288 movdqa x1, x4; \
289 pxor RNOT, x0; \
290 pand x2, x1; \
291 pxor x3, x1; \
292 por x4, x3; \
293 pxor x2, x4; \
294 pxor x3, x2; \
295 pxor x0, x3; \
296 por x1, x0; \
297 pand x0, x2; \
298 pxor x4, x0; \
299 pxor x3, x4; \
300 pand x0, x3; \
301 pxor x1, x4; \
302 pxor x4, x2; \
303 pxor x1, x3; \
304 por x0, x4; \
305 pxor x1, x4;
306
307#define SI0(x0, x1, x2, x3, x4) \
308 movdqa x3, x4; \
309 pxor x0, x1; \
310 por x1, x3; \
311 pxor x1, x4; \
312 pxor RNOT, x0; \
313 pxor x3, x2; \
314 pxor x0, x3; \
315 pand x1, x0; \
316 pxor x2, x0; \
317 pand x3, x2; \
318 pxor x4, x3; \
319 pxor x3, x2; \
320 pxor x3, x1; \
321 pand x0, x3; \
322 pxor x0, x1; \
323 pxor x2, x0; \
324 pxor x3, x4;
325
326#define SI1(x0, x1, x2, x3, x4) \
327 pxor x3, x1; \
328 movdqa x0, x4; \
329 pxor x2, x0; \
330 pxor RNOT, x2; \
331 por x1, x4; \
332 pxor x3, x4; \
333 pand x1, x3; \
334 pxor x2, x1; \
335 pand x4, x2; \
336 pxor x1, x4; \
337 por x3, x1; \
338 pxor x0, x3; \
339 pxor x0, x2; \
340 por x4, x0; \
341 pxor x4, x2; \
342 pxor x0, x1; \
343 pxor x1, x4;
344
345#define SI2(x0, x1, x2, x3, x4) \
346 pxor x1, x2; \
347 movdqa x3, x4; \
348 pxor RNOT, x3; \
349 por x2, x3; \
350 pxor x4, x2; \
351 pxor x0, x4; \
352 pxor x1, x3; \
353 por x2, x1; \
354 pxor x0, x2; \
355 pxor x4, x1; \
356 por x3, x4; \
357 pxor x3, x2; \
358 pxor x2, x4; \
359 pand x1, x2; \
360 pxor x3, x2; \
361 pxor x4, x3; \
362 pxor x0, x4;
363
364#define SI3(x0, x1, x2, x3, x4) \
365 pxor x1, x2; \
366 movdqa x1, x4; \
367 pand x2, x1; \
368 pxor x0, x1; \
369 por x4, x0; \
370 pxor x3, x4; \
371 pxor x3, x0; \
372 por x1, x3; \
373 pxor x2, x1; \
374 pxor x3, x1; \
375 pxor x2, x0; \
376 pxor x3, x2; \
377 pand x1, x3; \
378 pxor x0, x1; \
379 pand x2, x0; \
380 pxor x3, x4; \
381 pxor x0, x3; \
382 pxor x1, x0;
383
384#define SI4(x0, x1, x2, x3, x4) \
385 pxor x3, x2; \
386 movdqa x0, x4; \
387 pand x1, x0; \
388 pxor x2, x0; \
389 por x3, x2; \
390 pxor RNOT, x4; \
391 pxor x0, x1; \
392 pxor x2, x0; \
393 pand x4, x2; \
394 pxor x0, x2; \
395 por x4, x0; \
396 pxor x3, x0; \
397 pand x2, x3; \
398 pxor x3, x4; \
399 pxor x1, x3; \
400 pand x0, x1; \
401 pxor x1, x4; \
402 pxor x3, x0;
403
404#define SI5(x0, x1, x2, x3, x4) \
405 movdqa x1, x4; \
406 por x2, x1; \
407 pxor x4, x2; \
408 pxor x3, x1; \
409 pand x4, x3; \
410 pxor x3, x2; \
411 por x0, x3; \
412 pxor RNOT, x0; \
413 pxor x2, x3; \
414 por x0, x2; \
415 pxor x1, x4; \
416 pxor x4, x2; \
417 pand x0, x4; \
418 pxor x1, x0; \
419 pxor x3, x1; \
420 pand x2, x0; \
421 pxor x3, x2; \
422 pxor x2, x0; \
423 pxor x4, x2; \
424 pxor x3, x4;
425
426#define SI6(x0, x1, x2, x3, x4) \
427 pxor x2, x0; \
428 movdqa x0, x4; \
429 pand x3, x0; \
430 pxor x3, x2; \
431 pxor x2, x0; \
432 pxor x1, x3; \
433 por x4, x2; \
434 pxor x3, x2; \
435 pand x0, x3; \
436 pxor RNOT, x0; \
437 pxor x1, x3; \
438 pand x2, x1; \
439 pxor x0, x4; \
440 pxor x4, x3; \
441 pxor x2, x4; \
442 pxor x1, x0; \
443 pxor x0, x2;
444
445#define SI7(x0, x1, x2, x3, x4) \
446 movdqa x3, x4; \
447 pand x0, x3; \
448 pxor x2, x0; \
449 por x4, x2; \
450 pxor x1, x4; \
451 pxor RNOT, x0; \
452 por x3, x1; \
453 pxor x0, x4; \
454 pand x2, x0; \
455 pxor x1, x0; \
456 pand x2, x1; \
457 pxor x2, x3; \
458 pxor x3, x4; \
459 pand x3, x2; \
460 por x0, x3; \
461 pxor x4, x1; \
462 pxor x4, x3; \
463 pand x0, x4; \
464 pxor x2, x4;
465
466#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
467 movdqa x2, t3; \
468 movdqa x0, t1; \
469 unpcklps x3, t3; \
470 movdqa x0, t2; \
471 unpcklps x1, t1; \
472 unpckhps x1, t2; \
473 movdqa t3, x1; \
474 unpckhps x3, x2; \
475 movdqa t1, x0; \
476 movhlps t1, x1; \
477 movdqa t2, t1; \
478 movlhps t3, x0; \
479 movlhps x2, t1; \
480 movhlps t2, x2; \
481 movdqa x2, x3; \
482 movdqa t1, x2;
483
484#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
485 movdqu (0*4*4)(in), x0; \
486 movdqu (1*4*4)(in), x1; \
487 movdqu (2*4*4)(in), x2; \
488 movdqu (3*4*4)(in), x3; \
489 \
490 transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
491
492#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
493 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
494 \
495 movdqu x0, (0*4*4)(out); \
496 movdqu x1, (1*4*4)(out); \
497 movdqu x2, (2*4*4)(out); \
498 movdqu x3, (3*4*4)(out);
499
500#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
501 transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
502 \
503 movdqu (0*4*4)(out), t0; \
504 pxor t0, x0; \
505 movdqu x0, (0*4*4)(out); \
506 movdqu (1*4*4)(out), t0; \
507 pxor t0, x1; \
508 movdqu x1, (1*4*4)(out); \
509 movdqu (2*4*4)(out), t0; \
510 pxor t0, x2; \
511 movdqu x2, (2*4*4)(out); \
512 movdqu (3*4*4)(out), t0; \
513 pxor t0, x3; \
514 movdqu x3, (3*4*4)(out);
515
516.align 8
517.global __serpent_enc_blk_4way
518.type __serpent_enc_blk_4way,@function;
519
520__serpent_enc_blk_4way:
521 /* input:
522 * arg_ctx(%esp): ctx, CTX
523 * arg_dst(%esp): dst
524 * arg_src(%esp): src
525 * arg_xor(%esp): bool, if true: xor output
526 */
527
528 pcmpeqd RNOT, RNOT;
529
530 movl arg_ctx(%esp), CTX;
531
532 movl arg_src(%esp), %eax;
533 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
534
535 K(RA, RB, RC, RD, RE, 0);
536 S0(RA, RB, RC, RD, RE); LK(RC, RB, RD, RA, RE, 1);
537 S1(RC, RB, RD, RA, RE); LK(RE, RD, RA, RC, RB, 2);
538 S2(RE, RD, RA, RC, RB); LK(RB, RD, RE, RC, RA, 3);
539 S3(RB, RD, RE, RC, RA); LK(RC, RA, RD, RB, RE, 4);
540 S4(RC, RA, RD, RB, RE); LK(RA, RD, RB, RE, RC, 5);
541 S5(RA, RD, RB, RE, RC); LK(RC, RA, RD, RE, RB, 6);
542 S6(RC, RA, RD, RE, RB); LK(RD, RB, RA, RE, RC, 7);
543 S7(RD, RB, RA, RE, RC); LK(RC, RA, RE, RD, RB, 8);
544 S0(RC, RA, RE, RD, RB); LK(RE, RA, RD, RC, RB, 9);
545 S1(RE, RA, RD, RC, RB); LK(RB, RD, RC, RE, RA, 10);
546 S2(RB, RD, RC, RE, RA); LK(RA, RD, RB, RE, RC, 11);
547 S3(RA, RD, RB, RE, RC); LK(RE, RC, RD, RA, RB, 12);
548 S4(RE, RC, RD, RA, RB); LK(RC, RD, RA, RB, RE, 13);
549 S5(RC, RD, RA, RB, RE); LK(RE, RC, RD, RB, RA, 14);
550 S6(RE, RC, RD, RB, RA); LK(RD, RA, RC, RB, RE, 15);
551 S7(RD, RA, RC, RB, RE); LK(RE, RC, RB, RD, RA, 16);
552 S0(RE, RC, RB, RD, RA); LK(RB, RC, RD, RE, RA, 17);
553 S1(RB, RC, RD, RE, RA); LK(RA, RD, RE, RB, RC, 18);
554 S2(RA, RD, RE, RB, RC); LK(RC, RD, RA, RB, RE, 19);
555 S3(RC, RD, RA, RB, RE); LK(RB, RE, RD, RC, RA, 20);
556 S4(RB, RE, RD, RC, RA); LK(RE, RD, RC, RA, RB, 21);
557 S5(RE, RD, RC, RA, RB); LK(RB, RE, RD, RA, RC, 22);
558 S6(RB, RE, RD, RA, RC); LK(RD, RC, RE, RA, RB, 23);
559 S7(RD, RC, RE, RA, RB); LK(RB, RE, RA, RD, RC, 24);
560 S0(RB, RE, RA, RD, RC); LK(RA, RE, RD, RB, RC, 25);
561 S1(RA, RE, RD, RB, RC); LK(RC, RD, RB, RA, RE, 26);
562 S2(RC, RD, RB, RA, RE); LK(RE, RD, RC, RA, RB, 27);
563 S3(RE, RD, RC, RA, RB); LK(RA, RB, RD, RE, RC, 28);
564 S4(RA, RB, RD, RE, RC); LK(RB, RD, RE, RC, RA, 29);
565 S5(RB, RD, RE, RC, RA); LK(RA, RB, RD, RC, RE, 30);
566 S6(RA, RB, RD, RC, RE); LK(RD, RE, RB, RC, RA, 31);
567 S7(RD, RE, RB, RC, RA); K(RA, RB, RC, RD, RE, 32);
568
569 movl arg_dst(%esp), %eax;
570
571 cmpb $0, arg_xor(%esp);
572 jnz __enc_xor4;
573
574 write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
575
576 ret;
577
578__enc_xor4:
579 xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
580
581 ret;
582
583.align 8
584.global serpent_dec_blk_4way
585.type serpent_dec_blk_4way,@function;
586
587serpent_dec_blk_4way:
588 /* input:
589 * arg_ctx(%esp): ctx, CTX
590 * arg_dst(%esp): dst
591 * arg_src(%esp): src
592 */
593
594 pcmpeqd RNOT, RNOT;
595
596 movl arg_ctx(%esp), CTX;
597
598 movl arg_src(%esp), %eax;
599 read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);
600
601 K(RA, RB, RC, RD, RE, 32);
602 SI7(RA, RB, RC, RD, RE); KL(RB, RD, RA, RE, RC, 31);
603 SI6(RB, RD, RA, RE, RC); KL(RA, RC, RE, RB, RD, 30);
604 SI5(RA, RC, RE, RB, RD); KL(RC, RD, RA, RE, RB, 29);
605 SI4(RC, RD, RA, RE, RB); KL(RC, RA, RB, RE, RD, 28);
606 SI3(RC, RA, RB, RE, RD); KL(RB, RC, RD, RE, RA, 27);
607 SI2(RB, RC, RD, RE, RA); KL(RC, RA, RE, RD, RB, 26);
608 SI1(RC, RA, RE, RD, RB); KL(RB, RA, RE, RD, RC, 25);
609 SI0(RB, RA, RE, RD, RC); KL(RE, RC, RA, RB, RD, 24);
610 SI7(RE, RC, RA, RB, RD); KL(RC, RB, RE, RD, RA, 23);
611 SI6(RC, RB, RE, RD, RA); KL(RE, RA, RD, RC, RB, 22);
612 SI5(RE, RA, RD, RC, RB); KL(RA, RB, RE, RD, RC, 21);
613 SI4(RA, RB, RE, RD, RC); KL(RA, RE, RC, RD, RB, 20);
614 SI3(RA, RE, RC, RD, RB); KL(RC, RA, RB, RD, RE, 19);
615 SI2(RC, RA, RB, RD, RE); KL(RA, RE, RD, RB, RC, 18);
616 SI1(RA, RE, RD, RB, RC); KL(RC, RE, RD, RB, RA, 17);
617 SI0(RC, RE, RD, RB, RA); KL(RD, RA, RE, RC, RB, 16);
618 SI7(RD, RA, RE, RC, RB); KL(RA, RC, RD, RB, RE, 15);
619 SI6(RA, RC, RD, RB, RE); KL(RD, RE, RB, RA, RC, 14);
620 SI5(RD, RE, RB, RA, RC); KL(RE, RC, RD, RB, RA, 13);
621 SI4(RE, RC, RD, RB, RA); KL(RE, RD, RA, RB, RC, 12);
622 SI3(RE, RD, RA, RB, RC); KL(RA, RE, RC, RB, RD, 11);
623 SI2(RA, RE, RC, RB, RD); KL(RE, RD, RB, RC, RA, 10);
624 SI1(RE, RD, RB, RC, RA); KL(RA, RD, RB, RC, RE, 9);
625 SI0(RA, RD, RB, RC, RE); KL(RB, RE, RD, RA, RC, 8);
626 SI7(RB, RE, RD, RA, RC); KL(RE, RA, RB, RC, RD, 7);
627 SI6(RE, RA, RB, RC, RD); KL(RB, RD, RC, RE, RA, 6);
628 SI5(RB, RD, RC, RE, RA); KL(RD, RA, RB, RC, RE, 5);
629 SI4(RD, RA, RB, RC, RE); KL(RD, RB, RE, RC, RA, 4);
630 SI3(RD, RB, RE, RC, RA); KL(RE, RD, RA, RC, RB, 3);
631 SI2(RE, RD, RA, RC, RB); KL(RD, RB, RC, RA, RE, 2);
632 SI1(RD, RB, RC, RA, RE); KL(RE, RB, RC, RA, RD, 1);
633 SI0(RE, RB, RC, RA, RD); K(RC, RD, RB, RE, RA, 0);
634
635 movl arg_dst(%esp), %eax;
636 write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);
637
638 ret;