author     Tan Swee Heng <thesweeheng@gmail.com>    2007-12-17 11:04:40 -0500
committer  Herbert Xu <herbert@gondor.apana.org.au>  2008-01-10 16:16:57 -0500
commit     9a7dafbba47384c330779c75a1546684efaa8c1a (patch)
tree       0fde4a938ebc3c9deb0873b709dc5d2d69ab25c3 /arch/x86
parent     974e4b752ee623854c5dc2bbfc7c7725029ce173 (diff)
[CRYPTO] salsa20: Add x86-64 assembly version
This is the x86-64 version of the Salsa20 stream cipher algorithm. The
original assembly code came from
<http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>. It has been
reformatted for clarity.

Signed-off-by: Tan Swee Heng <thesweeheng@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
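As a reading aid for the listing below: the calling convention can be read off the `# x = arg1` style comments at the top of each routine, following the System V AMD64 ABI (arg1..arg4 in %rdi, %rsi, %rdx, %rcx). A minimal sketch of the implied C prototypes; the context struct and its field name are illustrative, not taken from this patch:

	#include <linux/linkage.h>
	#include <linux/types.h>

	/*
	 * Illustrative context layout: the 4x4 Salsa20 state matrix
	 * (constants at words 0/5/10/15, key, IV at words 6-7, and
	 * the 64-bit block counter at words 8-9, i.e. byte offset 32).
	 */
	struct salsa20_ctx {
		u32 input[16];
	};

	/* x = state, m = input stream, out = output stream */
	asmlinkage void ECRYPT_encrypt_bytes(struct salsa20_ctx *x, const u8 *m,
					     u8 *out, u32 bytes);

	/* k = key material, kbits = 128 or 256 */
	asmlinkage void ECRYPT_keysetup(struct salsa20_ctx *x, const u8 *k,
					u32 kbits);

	/* iv = 8-byte nonce; also resets the block counter at x + 32 to zero */
	asmlinkage void ECRYPT_ivsetup(struct salsa20_ctx *x, const u8 *iv);

The byte offsets in the prototypes' comments come straight from the keysetup/ivsetup stores below (`movq %r8,4(%rdi)`, `movq %rsi,24(%rdi)`, `movq %r8,32(%rdi)`); the `u32 bytes` width follows the ECRYPT reference API convention and is an assumption here.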
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/crypto/Makefile                 |   2
-rw-r--r--  arch/x86/crypto/salsa20-x86_64-asm_64.S  | 920
-rw-r--r--  arch/x86/crypto/salsa20_glue.c           |   2
3 files changed, 924 insertions, 0 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 25cc8441046a..09200e12f14d 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
@@ -15,3 +16,4 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
+salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
new file mode 100644
index 000000000000..6214a9b09706
--- /dev/null
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -0,0 +1,920 @@
+# enter ECRYPT_encrypt_bytes
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+	mov %rsp,%r11
+	and $31,%r11
+	add $256,%r11
+	sub %r11,%rsp
+	# x = arg1
+	mov %rdi,%r8
+	# m = arg2
+	mov %rsi,%rsi
+	# out = arg3
+	mov %rdx,%rdi
+	# bytes = arg4
+	mov %rcx,%rdx
+	# unsigned>? bytes - 0
+	cmp $0,%rdx
+	# comment:fp stack unchanged by jump
+	# goto done if !unsigned>
+	jbe ._done
+	# comment:fp stack unchanged by fallthrough
+# start:
+._start:
+	# r11_stack = r11
+	movq %r11,0(%rsp)
+	# r12_stack = r12
+	movq %r12,8(%rsp)
+	# r13_stack = r13
+	movq %r13,16(%rsp)
+	# r14_stack = r14
+	movq %r14,24(%rsp)
+	# r15_stack = r15
+	movq %r15,32(%rsp)
+	# rbx_stack = rbx
+	movq %rbx,40(%rsp)
+	# rbp_stack = rbp
+	movq %rbp,48(%rsp)
+	# in0 = *(uint64 *) (x + 0)
+	movq 0(%r8),%rcx
+	# in2 = *(uint64 *) (x + 8)
+	movq 8(%r8),%r9
+	# in4 = *(uint64 *) (x + 16)
+	movq 16(%r8),%rax
+	# in6 = *(uint64 *) (x + 24)
+	movq 24(%r8),%r10
+	# in8 = *(uint64 *) (x + 32)
+	movq 32(%r8),%r11
+	# in10 = *(uint64 *) (x + 40)
+	movq 40(%r8),%r12
+	# in12 = *(uint64 *) (x + 48)
+	movq 48(%r8),%r13
+	# in14 = *(uint64 *) (x + 56)
+	movq 56(%r8),%r14
+	# j0 = in0
+	movq %rcx,56(%rsp)
+	# j2 = in2
+	movq %r9,64(%rsp)
+	# j4 = in4
+	movq %rax,72(%rsp)
+	# j6 = in6
+	movq %r10,80(%rsp)
+	# j8 = in8
+	movq %r11,88(%rsp)
+	# j10 = in10
+	movq %r12,96(%rsp)
+	# j12 = in12
+	movq %r13,104(%rsp)
+	# j14 = in14
+	movq %r14,112(%rsp)
+	# x_backup = x
+	movq %r8,120(%rsp)
+# bytesatleast1:
+._bytesatleast1:
+	# unsigned<? bytes - 64
+	cmp $64,%rdx
+	# comment:fp stack unchanged by jump
+	# goto nocopy if !unsigned<
+	jae ._nocopy
+	# ctarget = out
+	movq %rdi,128(%rsp)
+	# out = &tmp
+	leaq 192(%rsp),%rdi
+	# i = bytes
+	mov %rdx,%rcx
+	# while (i) { *out++ = *m++; --i }
+	rep movsb
+	# out = &tmp
+	leaq 192(%rsp),%rdi
+	# m = &tmp
+	leaq 192(%rsp),%rsi
+	# comment:fp stack unchanged by fallthrough
+# nocopy:
+._nocopy:
+	# out_backup = out
+	movq %rdi,136(%rsp)
+	# m_backup = m
+	movq %rsi,144(%rsp)
+	# bytes_backup = bytes
+	movq %rdx,152(%rsp)
+	# x1 = j0
+	movq 56(%rsp),%rdi
+	# x0 = x1
+	mov %rdi,%rdx
+	# (uint64) x1 >>= 32
+	shr $32,%rdi
+	# x3 = j2
+	movq 64(%rsp),%rsi
+	# x2 = x3
+	mov %rsi,%rcx
+	# (uint64) x3 >>= 32
+	shr $32,%rsi
+	# x5 = j4
+	movq 72(%rsp),%r8
+	# x4 = x5
+	mov %r8,%r9
+	# (uint64) x5 >>= 32
+	shr $32,%r8
+	# x5_stack = x5
+	movq %r8,160(%rsp)
+	# x7 = j6
+	movq 80(%rsp),%r8
+	# x6 = x7
+	mov %r8,%rax
+	# (uint64) x7 >>= 32
+	shr $32,%r8
+	# x9 = j8
+	movq 88(%rsp),%r10
+	# x8 = x9
+	mov %r10,%r11
+	# (uint64) x9 >>= 32
+	shr $32,%r10
+	# x11 = j10
+	movq 96(%rsp),%r12
+	# x10 = x11
+	mov %r12,%r13
+	# x10_stack = x10
+	movq %r13,168(%rsp)
+	# (uint64) x11 >>= 32
+	shr $32,%r12
+	# x13 = j12
+	movq 104(%rsp),%r13
+	# x12 = x13
+	mov %r13,%r14
+	# (uint64) x13 >>= 32
+	shr $32,%r13
+	# x15 = j14
+	movq 112(%rsp),%r15
+	# x14 = x15
+	mov %r15,%rbx
+	# (uint64) x15 >>= 32
+	shr $32,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# i = 20
+	mov $20,%r15
+# mainloop:
+._mainloop:
+	# i_backup = i
+	movq %r15,184(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x12 + x0
+	lea (%r14,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x4 ^= a
+	xor %rbp,%r9
+	# b = x1 + x5
+	lea (%rdi,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x9 ^= b
+	xor %rbp,%r10
+	# a = x0 + x4
+	lea (%rdx,%r9),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x8 ^= a
+	xor %rbp,%r11
+	# b = x5 + x9
+	lea (%r15,%r10),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x13 ^= b
+	xor %rbp,%r13
+	# a = x4 + x8
+	lea (%r9,%r11),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x12 ^= a
+	xor %rbp,%r14
+	# b = x9 + x13
+	lea (%r10,%r13),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x1 ^= b
+	xor %rbp,%rdi
+	# a = x8 + x12
+	lea (%r11,%r14),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x13 + x1
+	lea (%r13,%rdi),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x6 + x10
+	lea (%rax,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x14 ^= c
+	xor %r15,%rbx
+	# c = x10 + x14
+	lea (%rbp,%rbx),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x2 ^= c
+	xor %r15,%rcx
+	# c = x14 + x2
+	lea (%rbx,%rcx),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x6 ^= c
+	xor %r15,%rax
+	# c = x2 + x6
+	lea (%rcx,%rax),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x11 + x15
+	lea (%r12,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x3 ^= d
+	xor %rbp,%rsi
+	# d = x15 + x3
+	lea (%r15,%rsi),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x7 ^= d
+	xor %rbp,%r8
+	# d = x3 + x7
+	lea (%rsi,%r8),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x11 ^= d
+	xor %rbp,%r12
+	# d = x7 + x11
+	lea (%r8,%r12),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x3 + x0
+	lea (%rsi,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x1 ^= a
+	xor %rbp,%rdi
+	# b = x4 + x5
+	lea (%r9,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x6 ^= b
+	xor %rbp,%rax
+	# a = x0 + x1
+	lea (%rdx,%rdi),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x2 ^= a
+	xor %rbp,%rcx
+	# b = x5 + x6
+	lea (%r15,%rax),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x7 ^= b
+	xor %rbp,%r8
+	# a = x1 + x2
+	lea (%rdi,%rcx),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x3 ^= a
+	xor %rbp,%rsi
+	# b = x6 + x7
+	lea (%rax,%r8),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x4 ^= b
+	xor %rbp,%r9
+	# a = x2 + x3
+	lea (%rcx,%rsi),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x7 + x4
+	lea (%r8,%r9),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x9 + x10
+	lea (%r10,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x11 ^= c
+	xor %r15,%r12
+	# c = x10 + x11
+	lea (%rbp,%r12),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x8 ^= c
+	xor %r15,%r11
+	# c = x11 + x8
+	lea (%r12,%r11),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x9 ^= c
+	xor %r15,%r10
+	# c = x8 + x9
+	lea (%r11,%r10),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x14 + x15
+	lea (%rbx,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x12 ^= d
+	xor %rbp,%r14
+	# d = x15 + x12
+	lea (%r15,%r14),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x13 ^= d
+	xor %rbp,%r13
+	# d = x12 + x13
+	lea (%r14,%r13),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x14 ^= d
+	xor %rbp,%rbx
+	# d = x13 + x14
+	lea (%r13,%rbx),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x12 + x0
+	lea (%r14,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x4 ^= a
+	xor %rbp,%r9
+	# b = x1 + x5
+	lea (%rdi,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x9 ^= b
+	xor %rbp,%r10
+	# a = x0 + x4
+	lea (%rdx,%r9),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x8 ^= a
+	xor %rbp,%r11
+	# b = x5 + x9
+	lea (%r15,%r10),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x13 ^= b
+	xor %rbp,%r13
+	# a = x4 + x8
+	lea (%r9,%r11),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x12 ^= a
+	xor %rbp,%r14
+	# b = x9 + x13
+	lea (%r10,%r13),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x1 ^= b
+	xor %rbp,%rdi
+	# a = x8 + x12
+	lea (%r11,%r14),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x13 + x1
+	lea (%r13,%rdi),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x6 + x10
+	lea (%rax,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x14 ^= c
+	xor %r15,%rbx
+	# c = x10 + x14
+	lea (%rbp,%rbx),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x2 ^= c
+	xor %r15,%rcx
+	# c = x14 + x2
+	lea (%rbx,%rcx),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x6 ^= c
+	xor %r15,%rax
+	# c = x2 + x6
+	lea (%rcx,%rax),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x11 + x15
+	lea (%r12,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x3 ^= d
+	xor %rbp,%rsi
+	# d = x15 + x3
+	lea (%r15,%rsi),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x7 ^= d
+	xor %rbp,%r8
+	# d = x3 + x7
+	lea (%rsi,%r8),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x11 ^= d
+	xor %rbp,%r12
+	# d = x7 + x11
+	lea (%r8,%r12),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x3 + x0
+	lea (%rsi,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x1 ^= a
+	xor %rbp,%rdi
+	# b = x4 + x5
+	lea (%r9,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x6 ^= b
+	xor %rbp,%rax
+	# a = x0 + x1
+	lea (%rdx,%rdi),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x2 ^= a
+	xor %rbp,%rcx
+	# b = x5 + x6
+	lea (%r15,%rax),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x7 ^= b
+	xor %rbp,%r8
+	# a = x1 + x2
+	lea (%rdi,%rcx),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x3 ^= a
+	xor %rbp,%rsi
+	# b = x6 + x7
+	lea (%rax,%r8),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x4 ^= b
+	xor %rbp,%r9
+	# a = x2 + x3
+	lea (%rcx,%rsi),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x7 + x4
+	lea (%r8,%r9),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x9 + x10
+	lea (%r10,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x11 ^= c
+	xor %r15,%r12
+	# c = x10 + x11
+	lea (%rbp,%r12),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x8 ^= c
+	xor %r15,%r11
+	# c = x11 + x8
+	lea (%r12,%r11),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x9 ^= c
+	xor %r15,%r10
+	# c = x8 + x9
+	lea (%r11,%r10),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x14 + x15
+	lea (%rbx,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x12 ^= d
+	xor %rbp,%r14
+	# d = x15 + x12
+	lea (%r15,%r14),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x13 ^= d
+	xor %rbp,%r13
+	# d = x12 + x13
+	lea (%r14,%r13),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x14 ^= d
+	xor %rbp,%rbx
+	# d = x13 + x14
+	lea (%r13,%rbx),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# i = i_backup
+	movq 184(%rsp),%r15
+	# unsigned>? i -= 4
+	sub $4,%r15
+	# comment:fp stack unchanged by jump
+	# goto mainloop if unsigned>
+	ja ._mainloop
+	# (uint32) x2 += j2
+	addl 64(%rsp),%ecx
+	# x3 <<= 32
+	shl $32,%rsi
+	# x3 += j2
+	addq 64(%rsp),%rsi
+	# (uint64) x3 >>= 32
+	shr $32,%rsi
+	# x3 <<= 32
+	shl $32,%rsi
+	# x2 += x3
+	add %rsi,%rcx
+	# (uint32) x6 += j6
+	addl 80(%rsp),%eax
+	# x7 <<= 32
+	shl $32,%r8
+	# x7 += j6
+	addq 80(%rsp),%r8
+	# (uint64) x7 >>= 32
+	shr $32,%r8
+	# x7 <<= 32
+	shl $32,%r8
+	# x6 += x7
+	add %r8,%rax
+	# (uint32) x8 += j8
+	addl 88(%rsp),%r11d
+	# x9 <<= 32
+	shl $32,%r10
+	# x9 += j8
+	addq 88(%rsp),%r10
+	# (uint64) x9 >>= 32
+	shr $32,%r10
+	# x9 <<= 32
+	shl $32,%r10
+	# x8 += x9
+	add %r10,%r11
+	# (uint32) x12 += j12
+	addl 104(%rsp),%r14d
+	# x13 <<= 32
+	shl $32,%r13
+	# x13 += j12
+	addq 104(%rsp),%r13
+	# (uint64) x13 >>= 32
+	shr $32,%r13
+	# x13 <<= 32
+	shl $32,%r13
+	# x12 += x13
+	add %r13,%r14
+	# (uint32) x0 += j0
+	addl 56(%rsp),%edx
+	# x1 <<= 32
+	shl $32,%rdi
+	# x1 += j0
+	addq 56(%rsp),%rdi
+	# (uint64) x1 >>= 32
+	shr $32,%rdi
+	# x1 <<= 32
+	shl $32,%rdi
+	# x0 += x1
+	add %rdi,%rdx
+	# x5 = x5_stack
+	movq 160(%rsp),%rdi
+	# (uint32) x4 += j4
+	addl 72(%rsp),%r9d
+	# x5 <<= 32
+	shl $32,%rdi
+	# x5 += j4
+	addq 72(%rsp),%rdi
+	# (uint64) x5 >>= 32
+	shr $32,%rdi
+	# x5 <<= 32
+	shl $32,%rdi
+	# x4 += x5
+	add %rdi,%r9
+	# x10 = x10_stack
+	movq 168(%rsp),%r8
+	# (uint32) x10 += j10
+	addl 96(%rsp),%r8d
+	# x11 <<= 32
+	shl $32,%r12
+	# x11 += j10
+	addq 96(%rsp),%r12
+	# (uint64) x11 >>= 32
+	shr $32,%r12
+	# x11 <<= 32
+	shl $32,%r12
+	# x10 += x11
+	add %r12,%r8
+	# x15 = x15_stack
+	movq 176(%rsp),%rdi
+	# (uint32) x14 += j14
+	addl 112(%rsp),%ebx
+	# x15 <<= 32
+	shl $32,%rdi
+	# x15 += j14
+	addq 112(%rsp),%rdi
+	# (uint64) x15 >>= 32
+	shr $32,%rdi
+	# x15 <<= 32
+	shl $32,%rdi
+	# x14 += x15
+	add %rdi,%rbx
+	# out = out_backup
+	movq 136(%rsp),%rdi
+	# m = m_backup
+	movq 144(%rsp),%rsi
+	# x0 ^= *(uint64 *) (m + 0)
+	xorq 0(%rsi),%rdx
+	# *(uint64 *) (out + 0) = x0
+	movq %rdx,0(%rdi)
+	# x2 ^= *(uint64 *) (m + 8)
+	xorq 8(%rsi),%rcx
+	# *(uint64 *) (out + 8) = x2
+	movq %rcx,8(%rdi)
+	# x4 ^= *(uint64 *) (m + 16)
+	xorq 16(%rsi),%r9
+	# *(uint64 *) (out + 16) = x4
+	movq %r9,16(%rdi)
+	# x6 ^= *(uint64 *) (m + 24)
+	xorq 24(%rsi),%rax
+	# *(uint64 *) (out + 24) = x6
+	movq %rax,24(%rdi)
+	# x8 ^= *(uint64 *) (m + 32)
+	xorq 32(%rsi),%r11
+	# *(uint64 *) (out + 32) = x8
+	movq %r11,32(%rdi)
+	# x10 ^= *(uint64 *) (m + 40)
+	xorq 40(%rsi),%r8
+	# *(uint64 *) (out + 40) = x10
+	movq %r8,40(%rdi)
+	# x12 ^= *(uint64 *) (m + 48)
+	xorq 48(%rsi),%r14
+	# *(uint64 *) (out + 48) = x12
+	movq %r14,48(%rdi)
+	# x14 ^= *(uint64 *) (m + 56)
+	xorq 56(%rsi),%rbx
+	# *(uint64 *) (out + 56) = x14
+	movq %rbx,56(%rdi)
+	# bytes = bytes_backup
+	movq 152(%rsp),%rdx
+	# in8 = j8
+	movq 88(%rsp),%rcx
+	# in8 += 1
+	add $1,%rcx
+	# j8 = in8
+	movq %rcx,88(%rsp)
+	# unsigned>? unsigned<? bytes - 64
+	cmp $64,%rdx
+	# comment:fp stack unchanged by jump
+	# goto bytesatleast65 if unsigned>
+	ja ._bytesatleast65
+	# comment:fp stack unchanged by jump
+	# goto bytesatleast64 if !unsigned<
+	jae ._bytesatleast64
+	# m = out
+	mov %rdi,%rsi
+	# out = ctarget
+	movq 128(%rsp),%rdi
+	# i = bytes
+	mov %rdx,%rcx
+	# while (i) { *out++ = *m++; --i }
+	rep movsb
+	# comment:fp stack unchanged by fallthrough
+# bytesatleast64:
+._bytesatleast64:
+	# x = x_backup
+	movq 120(%rsp),%rdi
+	# in8 = j8
+	movq 88(%rsp),%rsi
+	# *(uint64 *) (x + 32) = in8
+	movq %rsi,32(%rdi)
+	# r11 = r11_stack
+	movq 0(%rsp),%r11
+	# r12 = r12_stack
+	movq 8(%rsp),%r12
+	# r13 = r13_stack
+	movq 16(%rsp),%r13
+	# r14 = r14_stack
+	movq 24(%rsp),%r14
+	# r15 = r15_stack
+	movq 32(%rsp),%r15
+	# rbx = rbx_stack
+	movq 40(%rsp),%rbx
+	# rbp = rbp_stack
+	movq 48(%rsp),%rbp
+	# comment:fp stack unchanged by fallthrough
+# done:
+._done:
+	# leave
+	add %r11,%rsp
+	mov %rdi,%rax
+	mov %rsi,%rdx
+	ret
+# bytesatleast65:
+._bytesatleast65:
+	# bytes -= 64
+	sub $64,%rdx
+	# out += 64
+	add $64,%rdi
+	# m += 64
+	add $64,%rsi
+	# comment:fp stack unchanged by jump
+	# goto bytesatleast1
+	jmp ._bytesatleast1
+# enter ECRYPT_keysetup
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+	mov %rsp,%r11
+	and $31,%r11
+	add $256,%r11
+	sub %r11,%rsp
+	# k = arg2
+	mov %rsi,%rsi
+	# kbits = arg3
+	mov %rdx,%rdx
+	# x = arg1
+	mov %rdi,%rdi
+	# in0 = *(uint64 *) (k + 0)
+	movq 0(%rsi),%r8
+	# in2 = *(uint64 *) (k + 8)
+	movq 8(%rsi),%r9
+	# *(uint64 *) (x + 4) = in0
+	movq %r8,4(%rdi)
+	# *(uint64 *) (x + 12) = in2
+	movq %r9,12(%rdi)
+	# unsigned<? kbits - 256
+	cmp $256,%rdx
+	# comment:fp stack unchanged by jump
+	# goto kbits128 if unsigned<
+	jb ._kbits128
+# kbits256:
+._kbits256:
+	# in10 = *(uint64 *) (k + 16)
+	movq 16(%rsi),%rdx
+	# in12 = *(uint64 *) (k + 24)
+	movq 24(%rsi),%rsi
+	# *(uint64 *) (x + 44) = in10
+	movq %rdx,44(%rdi)
+	# *(uint64 *) (x + 52) = in12
+	movq %rsi,52(%rdi)
+	# in0 = 1634760805
+	mov $1634760805,%rsi
+	# in4 = 857760878
+	mov $857760878,%rdx
+	# in10 = 2036477234
+	mov $2036477234,%rcx
+	# in14 = 1797285236
+	mov $1797285236,%r8
+	# *(uint32 *) (x + 0) = in0
+	movl %esi,0(%rdi)
+	# *(uint32 *) (x + 20) = in4
+	movl %edx,20(%rdi)
+	# *(uint32 *) (x + 40) = in10
+	movl %ecx,40(%rdi)
+	# *(uint32 *) (x + 60) = in14
+	movl %r8d,60(%rdi)
+	# comment:fp stack unchanged by jump
+	# goto keysetupdone
+	jmp ._keysetupdone
+# kbits128:
+._kbits128:
+	# in10 = *(uint64 *) (k + 0)
+	movq 0(%rsi),%rdx
+	# in12 = *(uint64 *) (k + 8)
+	movq 8(%rsi),%rsi
+	# *(uint64 *) (x + 44) = in10
+	movq %rdx,44(%rdi)
+	# *(uint64 *) (x + 52) = in12
+	movq %rsi,52(%rdi)
+	# in0 = 1634760805
+	mov $1634760805,%rsi
+	# in4 = 824206446
+	mov $824206446,%rdx
+	# in10 = 2036477238
+	mov $2036477238,%rcx
+	# in14 = 1797285236
+	mov $1797285236,%r8
+	# *(uint32 *) (x + 0) = in0
+	movl %esi,0(%rdi)
+	# *(uint32 *) (x + 20) = in4
+	movl %edx,20(%rdi)
+	# *(uint32 *) (x + 40) = in10
+	movl %ecx,40(%rdi)
+	# *(uint32 *) (x + 60) = in14
+	movl %r8d,60(%rdi)
+# keysetupdone:
+._keysetupdone:
+	# leave
+	add %r11,%rsp
+	mov %rdi,%rax
+	mov %rsi,%rdx
+	ret
+# enter ECRYPT_ivsetup
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+	mov %rsp,%r11
+	and $31,%r11
+	add $256,%r11
+	sub %r11,%rsp
+	# iv = arg2
+	mov %rsi,%rsi
+	# x = arg1
+	mov %rdi,%rdi
+	# in6 = *(uint64 *) (iv + 0)
+	movq 0(%rsi),%rsi
+	# in8 = 0
+	mov $0,%r8
+	# *(uint64 *) (x + 24) = in6
+	movq %rsi,24(%rdi)
+	# *(uint64 *) (x + 32) = in8
+	movq %r8,32(%rdi)
+	# leave
+	add %r11,%rsp
+	mov %rdi,%rax
+	mov %rsi,%rdx
+	ret
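A note on reading `._mainloop` above: each pass through the loop performs two Salsa20 double rounds (a column round, then a row round, twice over), and the counter set up by `mov $20,%r15` drops by 4 per pass, giving the cipher's 20 rounds. Every `lea`/`rol`/`xor` triple is one quarter-round step, x[b] ^= (x[a] + x[d]) <<< k, with the 16 state words split across registers and a few stack slots. A plain C rendering of one double round, as a reading aid only (`rotl32` and the array form of the state are illustrative, not part of this patch):

	#include <stdint.h>

	static inline uint32_t rotl32(uint32_t v, int n)
	{
		return (v << n) | (v >> (32 - n));
	}

	/* One Salsa20 double round over the 4x4 uint32 state. */
	static void salsa20_doubleround(uint32_t x[16])
	{
		/* column round: the a/b/c/d chains in the assembly */
		x[ 4] ^= rotl32(x[ 0] + x[12],  7);
		x[ 8] ^= rotl32(x[ 4] + x[ 0],  9);
		x[12] ^= rotl32(x[ 8] + x[ 4], 13);
		x[ 0] ^= rotl32(x[12] + x[ 8], 18);
		x[ 9] ^= rotl32(x[ 5] + x[ 1],  7);
		x[13] ^= rotl32(x[ 9] + x[ 5],  9);
		x[ 1] ^= rotl32(x[13] + x[ 9], 13);
		x[ 5] ^= rotl32(x[ 1] + x[13], 18);
		x[14] ^= rotl32(x[10] + x[ 6],  7);
		x[ 2] ^= rotl32(x[14] + x[10],  9);
		x[ 6] ^= rotl32(x[ 2] + x[14], 13);
		x[10] ^= rotl32(x[ 6] + x[ 2], 18);
		x[ 3] ^= rotl32(x[15] + x[11],  7);
		x[ 7] ^= rotl32(x[ 3] + x[15],  9);
		x[11] ^= rotl32(x[ 7] + x[ 3], 13);
		x[15] ^= rotl32(x[11] + x[ 7], 18);
		/* row round */
		x[ 1] ^= rotl32(x[ 0] + x[ 3],  7);
		x[ 2] ^= rotl32(x[ 1] + x[ 0],  9);
		x[ 3] ^= rotl32(x[ 2] + x[ 1], 13);
		x[ 0] ^= rotl32(x[ 3] + x[ 2], 18);
		x[ 6] ^= rotl32(x[ 5] + x[ 4],  7);
		x[ 7] ^= rotl32(x[ 6] + x[ 5],  9);
		x[ 4] ^= rotl32(x[ 7] + x[ 6], 13);
		x[ 5] ^= rotl32(x[ 4] + x[ 7], 18);
		x[11] ^= rotl32(x[10] + x[ 9],  7);
		x[ 8] ^= rotl32(x[11] + x[10],  9);
		x[ 9] ^= rotl32(x[ 8] + x[11], 13);
		x[10] ^= rotl32(x[ 9] + x[ 8], 18);
		x[12] ^= rotl32(x[15] + x[14],  7);
		x[13] ^= rotl32(x[12] + x[15],  9);
		x[14] ^= rotl32(x[13] + x[12], 13);
		x[15] ^= rotl32(x[14] + x[13], 18);
	}

The magic numbers in `ECRYPT_keysetup` are the diagonal constants: 1634760805, 857760878, 2036477234 and 1797285236 are the little-endian words of the ASCII string "expand 32-byte k", while the 128-bit path substitutes 824206446 and 2036477238 from "expand 16-byte k".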
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index 3be443995ed6..bccb76d80987 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -8,6 +8,8 @@
  * and to remove extraneous comments and functions that are not needed.
  * - i586 version, renamed as salsa20-i586-asm_32.S
  *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
+ * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
+ *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free