aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Eric Biggers <ebiggers@google.com> 2018-05-26 03:08:58 -0400
committer: Herbert Xu <herbert@gondor.apana.org.au> 2018-05-30 12:13:57 -0400
commit: b7b73cd5d74694ed59abcdb4974dacb4ff8b2a2a (patch)
tree: ee9c6e6e31bbda855a25f76052c0922b5f4fe094
parent: 0b3a830bb407dce79468a26f382260131b50b3c5 (diff)
crypto: x86/salsa20 - remove x86 salsa20 implementations
The x86 assembly implementations of Salsa20 use the frame base pointer register (%ebp or %rbp), which breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. Recent (v4.10+) kernels will warn about this, e.g.

WARNING: kernel stack regs at 00000000a8291e69 in syzkaller047086:4677 has bad 'bp' value 000000001077994c
[...]

But after looking into it, I believe there's very little reason to still retain the x86 Salsa20 code. First, these are *not* vectorized (SSE2/SSSE3/AVX2) implementations, which would be needed to get anywhere close to the best Salsa20 performance on any remotely modern x86 processor; they're just regular x86 assembly. Second, it's still unclear that anyone is actually using the kernel's Salsa20 at all, especially given that now ChaCha20 is supported too, and with much more efficient SSSE3 and AVX2 implementations. Finally, in benchmarks I did on both Intel and AMD processors with both gcc 8.1.0 and gcc 4.9.4, the x86_64 salsa20-asm is actually slightly *slower* than salsa20-generic (~3% slower on Skylake, ~10% slower on Zen), while the i686 salsa20-asm is only slightly faster than salsa20-generic (~15% faster on Skylake, ~20% faster on Zen). The gcc version made little difference.

So, the x86_64 salsa20-asm is pretty clearly useless. That leaves just the i686 salsa20-asm, which based on my tests provides a 15-20% speed boost. But that's without updating the code to not use %ebp. And given the maintenance cost, the small speed difference vs. salsa20-generic, the fact that few people still use i686 kernels, the doubt that anyone is even using the kernel's Salsa20 at all, and the fact that an SSE2 implementation would almost certainly be much faster on any remotely modern x86 processor yet no one has cared enough to add one yet, I don't think it's worthwhile to keep.

Thus, just remove both the x86_64 and i686 salsa20-asm implementations.
Reported-by: syzbot+ffa3a158337bbc01ff09@syzkaller.appspotmail.com
Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
-rw-r--r-- arch/x86/crypto/Makefile | 4
-rw-r--r-- arch/x86/crypto/salsa20-i586-asm_32.S | 938
-rw-r--r-- arch/x86/crypto/salsa20-x86_64-asm_64.S | 805
-rw-r--r-- arch/x86/crypto/salsa20_glue.c | 91
-rw-r--r-- crypto/Kconfig | 28
5 files changed, 0 insertions, 1866 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 48e731d782e9..a450ad573dcb 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o
15 15
16obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 16obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
17obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 17obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
18obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
19obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o 18obj-$(CONFIG_CRYPTO_SERPENT_SSE2_586) += serpent-sse2-i586.o
20 19
21obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o 20obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
@@ -24,7 +23,6 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
24obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o 23obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
25obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 24obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
26obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o 25obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
27obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
28obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o 26obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
29obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o 27obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
30obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 28obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
@@ -71,7 +69,6 @@ endif
71 69
72aes-i586-y := aes-i586-asm_32.o aes_glue.o 70aes-i586-y := aes-i586-asm_32.o aes_glue.o
73twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 71twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
74salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
75serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o 72serpent-sse2-i586-y := serpent-sse2-i586-asm_32.o serpent_sse2_glue.o
76 73
77aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o 74aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
@@ -80,7 +77,6 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
80blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o 77blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
81twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 78twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
82twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o 79twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
83salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
84chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o 80chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
85serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o 81serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
86 82
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
deleted file mode 100644
index 6014b7b9e52a..000000000000
--- a/arch/x86/crypto/salsa20-i586-asm_32.S
+++ /dev/null
@@ -1,938 +0,0 @@
1# Derived from:
2# salsa20_pm.s version 20051229
3# D. J. Bernstein
4# Public domain.
5
6#include <linux/linkage.h>
7
8.text
9
10# enter salsa20_encrypt_bytes
11ENTRY(salsa20_encrypt_bytes)
12 mov %esp,%eax
13 and $31,%eax
14 add $256,%eax
15 sub %eax,%esp
16 # eax_stack = eax
17 movl %eax,80(%esp)
18 # ebx_stack = ebx
19 movl %ebx,84(%esp)
20 # esi_stack = esi
21 movl %esi,88(%esp)
22 # edi_stack = edi
23 movl %edi,92(%esp)
24 # ebp_stack = ebp
25 movl %ebp,96(%esp)
26 # x = arg1
27 movl 4(%esp,%eax),%edx
28 # m = arg2
29 movl 8(%esp,%eax),%esi
30 # out = arg3
31 movl 12(%esp,%eax),%edi
32 # bytes = arg4
33 movl 16(%esp,%eax),%ebx
34 # bytes -= 0
35 sub $0,%ebx
36 # goto done if unsigned<=
37 jbe ._done
38._start:
39 # in0 = *(uint32 *) (x + 0)
40 movl 0(%edx),%eax
41 # in1 = *(uint32 *) (x + 4)
42 movl 4(%edx),%ecx
43 # in2 = *(uint32 *) (x + 8)
44 movl 8(%edx),%ebp
45 # j0 = in0
46 movl %eax,164(%esp)
47 # in3 = *(uint32 *) (x + 12)
48 movl 12(%edx),%eax
49 # j1 = in1
50 movl %ecx,168(%esp)
51 # in4 = *(uint32 *) (x + 16)
52 movl 16(%edx),%ecx
53 # j2 = in2
54 movl %ebp,172(%esp)
55 # in5 = *(uint32 *) (x + 20)
56 movl 20(%edx),%ebp
57 # j3 = in3
58 movl %eax,176(%esp)
59 # in6 = *(uint32 *) (x + 24)
60 movl 24(%edx),%eax
61 # j4 = in4
62 movl %ecx,180(%esp)
63 # in7 = *(uint32 *) (x + 28)
64 movl 28(%edx),%ecx
65 # j5 = in5
66 movl %ebp,184(%esp)
67 # in8 = *(uint32 *) (x + 32)
68 movl 32(%edx),%ebp
69 # j6 = in6
70 movl %eax,188(%esp)
71 # in9 = *(uint32 *) (x + 36)
72 movl 36(%edx),%eax
73 # j7 = in7
74 movl %ecx,192(%esp)
75 # in10 = *(uint32 *) (x + 40)
76 movl 40(%edx),%ecx
77 # j8 = in8
78 movl %ebp,196(%esp)
79 # in11 = *(uint32 *) (x + 44)
80 movl 44(%edx),%ebp
81 # j9 = in9
82 movl %eax,200(%esp)
83 # in12 = *(uint32 *) (x + 48)
84 movl 48(%edx),%eax
85 # j10 = in10
86 movl %ecx,204(%esp)
87 # in13 = *(uint32 *) (x + 52)
88 movl 52(%edx),%ecx
89 # j11 = in11
90 movl %ebp,208(%esp)
91 # in14 = *(uint32 *) (x + 56)
92 movl 56(%edx),%ebp
93 # j12 = in12
94 movl %eax,212(%esp)
95 # in15 = *(uint32 *) (x + 60)
96 movl 60(%edx),%eax
97 # j13 = in13
98 movl %ecx,216(%esp)
99 # j14 = in14
100 movl %ebp,220(%esp)
101 # j15 = in15
102 movl %eax,224(%esp)
103 # x_backup = x
104 movl %edx,64(%esp)
105._bytesatleast1:
106 # bytes - 64
107 cmp $64,%ebx
108 # goto nocopy if unsigned>=
109 jae ._nocopy
110 # ctarget = out
111 movl %edi,228(%esp)
112 # out = &tmp
113 leal 0(%esp),%edi
114 # i = bytes
115 mov %ebx,%ecx
116 # while (i) { *out++ = *m++; --i }
117 rep movsb
118 # out = &tmp
119 leal 0(%esp),%edi
120 # m = &tmp
121 leal 0(%esp),%esi
122._nocopy:
123 # out_backup = out
124 movl %edi,72(%esp)
125 # m_backup = m
126 movl %esi,68(%esp)
127 # bytes_backup = bytes
128 movl %ebx,76(%esp)
129 # in0 = j0
130 movl 164(%esp),%eax
131 # in1 = j1
132 movl 168(%esp),%ecx
133 # in2 = j2
134 movl 172(%esp),%edx
135 # in3 = j3
136 movl 176(%esp),%ebx
137 # x0 = in0
138 movl %eax,100(%esp)
139 # x1 = in1
140 movl %ecx,104(%esp)
141 # x2 = in2
142 movl %edx,108(%esp)
143 # x3 = in3
144 movl %ebx,112(%esp)
145 # in4 = j4
146 movl 180(%esp),%eax
147 # in5 = j5
148 movl 184(%esp),%ecx
149 # in6 = j6
150 movl 188(%esp),%edx
151 # in7 = j7
152 movl 192(%esp),%ebx
153 # x4 = in4
154 movl %eax,116(%esp)
155 # x5 = in5
156 movl %ecx,120(%esp)
157 # x6 = in6
158 movl %edx,124(%esp)
159 # x7 = in7
160 movl %ebx,128(%esp)
161 # in8 = j8
162 movl 196(%esp),%eax
163 # in9 = j9
164 movl 200(%esp),%ecx
165 # in10 = j10
166 movl 204(%esp),%edx
167 # in11 = j11
168 movl 208(%esp),%ebx
169 # x8 = in8
170 movl %eax,132(%esp)
171 # x9 = in9
172 movl %ecx,136(%esp)
173 # x10 = in10
174 movl %edx,140(%esp)
175 # x11 = in11
176 movl %ebx,144(%esp)
177 # in12 = j12
178 movl 212(%esp),%eax
179 # in13 = j13
180 movl 216(%esp),%ecx
181 # in14 = j14
182 movl 220(%esp),%edx
183 # in15 = j15
184 movl 224(%esp),%ebx
185 # x12 = in12
186 movl %eax,148(%esp)
187 # x13 = in13
188 movl %ecx,152(%esp)
189 # x14 = in14
190 movl %edx,156(%esp)
191 # x15 = in15
192 movl %ebx,160(%esp)
193 # i = 20
194 mov $20,%ebp
195 # p = x0
196 movl 100(%esp),%eax
197 # s = x5
198 movl 120(%esp),%ecx
199 # t = x10
200 movl 140(%esp),%edx
201 # w = x15
202 movl 160(%esp),%ebx
203._mainloop:
204 # x0 = p
205 movl %eax,100(%esp)
206 # x10 = t
207 movl %edx,140(%esp)
208 # p += x12
209 addl 148(%esp),%eax
210 # x5 = s
211 movl %ecx,120(%esp)
212 # t += x6
213 addl 124(%esp),%edx
214 # x15 = w
215 movl %ebx,160(%esp)
216 # r = x1
217 movl 104(%esp),%esi
218 # r += s
219 add %ecx,%esi
220 # v = x11
221 movl 144(%esp),%edi
222 # v += w
223 add %ebx,%edi
224 # p <<<= 7
225 rol $7,%eax
226 # p ^= x4
227 xorl 116(%esp),%eax
228 # t <<<= 7
229 rol $7,%edx
230 # t ^= x14
231 xorl 156(%esp),%edx
232 # r <<<= 7
233 rol $7,%esi
234 # r ^= x9
235 xorl 136(%esp),%esi
236 # v <<<= 7
237 rol $7,%edi
238 # v ^= x3
239 xorl 112(%esp),%edi
240 # x4 = p
241 movl %eax,116(%esp)
242 # x14 = t
243 movl %edx,156(%esp)
244 # p += x0
245 addl 100(%esp),%eax
246 # x9 = r
247 movl %esi,136(%esp)
248 # t += x10
249 addl 140(%esp),%edx
250 # x3 = v
251 movl %edi,112(%esp)
252 # p <<<= 9
253 rol $9,%eax
254 # p ^= x8
255 xorl 132(%esp),%eax
256 # t <<<= 9
257 rol $9,%edx
258 # t ^= x2
259 xorl 108(%esp),%edx
260 # s += r
261 add %esi,%ecx
262 # s <<<= 9
263 rol $9,%ecx
264 # s ^= x13
265 xorl 152(%esp),%ecx
266 # w += v
267 add %edi,%ebx
268 # w <<<= 9
269 rol $9,%ebx
270 # w ^= x7
271 xorl 128(%esp),%ebx
272 # x8 = p
273 movl %eax,132(%esp)
274 # x2 = t
275 movl %edx,108(%esp)
276 # p += x4
277 addl 116(%esp),%eax
278 # x13 = s
279 movl %ecx,152(%esp)
280 # t += x14
281 addl 156(%esp),%edx
282 # x7 = w
283 movl %ebx,128(%esp)
284 # p <<<= 13
285 rol $13,%eax
286 # p ^= x12
287 xorl 148(%esp),%eax
288 # t <<<= 13
289 rol $13,%edx
290 # t ^= x6
291 xorl 124(%esp),%edx
292 # r += s
293 add %ecx,%esi
294 # r <<<= 13
295 rol $13,%esi
296 # r ^= x1
297 xorl 104(%esp),%esi
298 # v += w
299 add %ebx,%edi
300 # v <<<= 13
301 rol $13,%edi
302 # v ^= x11
303 xorl 144(%esp),%edi
304 # x12 = p
305 movl %eax,148(%esp)
306 # x6 = t
307 movl %edx,124(%esp)
308 # p += x8
309 addl 132(%esp),%eax
310 # x1 = r
311 movl %esi,104(%esp)
312 # t += x2
313 addl 108(%esp),%edx
314 # x11 = v
315 movl %edi,144(%esp)
316 # p <<<= 18
317 rol $18,%eax
318 # p ^= x0
319 xorl 100(%esp),%eax
320 # t <<<= 18
321 rol $18,%edx
322 # t ^= x10
323 xorl 140(%esp),%edx
324 # s += r
325 add %esi,%ecx
326 # s <<<= 18
327 rol $18,%ecx
328 # s ^= x5
329 xorl 120(%esp),%ecx
330 # w += v
331 add %edi,%ebx
332 # w <<<= 18
333 rol $18,%ebx
334 # w ^= x15
335 xorl 160(%esp),%ebx
336 # x0 = p
337 movl %eax,100(%esp)
338 # x10 = t
339 movl %edx,140(%esp)
340 # p += x3
341 addl 112(%esp),%eax
342 # p <<<= 7
343 rol $7,%eax
344 # x5 = s
345 movl %ecx,120(%esp)
346 # t += x9
347 addl 136(%esp),%edx
348 # x15 = w
349 movl %ebx,160(%esp)
350 # r = x4
351 movl 116(%esp),%esi
352 # r += s
353 add %ecx,%esi
354 # v = x14
355 movl 156(%esp),%edi
356 # v += w
357 add %ebx,%edi
358 # p ^= x1
359 xorl 104(%esp),%eax
360 # t <<<= 7
361 rol $7,%edx
362 # t ^= x11
363 xorl 144(%esp),%edx
364 # r <<<= 7
365 rol $7,%esi
366 # r ^= x6
367 xorl 124(%esp),%esi
368 # v <<<= 7
369 rol $7,%edi
370 # v ^= x12
371 xorl 148(%esp),%edi
372 # x1 = p
373 movl %eax,104(%esp)
374 # x11 = t
375 movl %edx,144(%esp)
376 # p += x0
377 addl 100(%esp),%eax
378 # x6 = r
379 movl %esi,124(%esp)
380 # t += x10
381 addl 140(%esp),%edx
382 # x12 = v
383 movl %edi,148(%esp)
384 # p <<<= 9
385 rol $9,%eax
386 # p ^= x2
387 xorl 108(%esp),%eax
388 # t <<<= 9
389 rol $9,%edx
390 # t ^= x8
391 xorl 132(%esp),%edx
392 # s += r
393 add %esi,%ecx
394 # s <<<= 9
395 rol $9,%ecx
396 # s ^= x7
397 xorl 128(%esp),%ecx
398 # w += v
399 add %edi,%ebx
400 # w <<<= 9
401 rol $9,%ebx
402 # w ^= x13
403 xorl 152(%esp),%ebx
404 # x2 = p
405 movl %eax,108(%esp)
406 # x8 = t
407 movl %edx,132(%esp)
408 # p += x1
409 addl 104(%esp),%eax
410 # x7 = s
411 movl %ecx,128(%esp)
412 # t += x11
413 addl 144(%esp),%edx
414 # x13 = w
415 movl %ebx,152(%esp)
416 # p <<<= 13
417 rol $13,%eax
418 # p ^= x3
419 xorl 112(%esp),%eax
420 # t <<<= 13
421 rol $13,%edx
422 # t ^= x9
423 xorl 136(%esp),%edx
424 # r += s
425 add %ecx,%esi
426 # r <<<= 13
427 rol $13,%esi
428 # r ^= x4
429 xorl 116(%esp),%esi
430 # v += w
431 add %ebx,%edi
432 # v <<<= 13
433 rol $13,%edi
434 # v ^= x14
435 xorl 156(%esp),%edi
436 # x3 = p
437 movl %eax,112(%esp)
438 # x9 = t
439 movl %edx,136(%esp)
440 # p += x2
441 addl 108(%esp),%eax
442 # x4 = r
443 movl %esi,116(%esp)
444 # t += x8
445 addl 132(%esp),%edx
446 # x14 = v
447 movl %edi,156(%esp)
448 # p <<<= 18
449 rol $18,%eax
450 # p ^= x0
451 xorl 100(%esp),%eax
452 # t <<<= 18
453 rol $18,%edx
454 # t ^= x10
455 xorl 140(%esp),%edx
456 # s += r
457 add %esi,%ecx
458 # s <<<= 18
459 rol $18,%ecx
460 # s ^= x5
461 xorl 120(%esp),%ecx
462 # w += v
463 add %edi,%ebx
464 # w <<<= 18
465 rol $18,%ebx
466 # w ^= x15
467 xorl 160(%esp),%ebx
468 # x0 = p
469 movl %eax,100(%esp)
470 # x10 = t
471 movl %edx,140(%esp)
472 # p += x12
473 addl 148(%esp),%eax
474 # x5 = s
475 movl %ecx,120(%esp)
476 # t += x6
477 addl 124(%esp),%edx
478 # x15 = w
479 movl %ebx,160(%esp)
480 # r = x1
481 movl 104(%esp),%esi
482 # r += s
483 add %ecx,%esi
484 # v = x11
485 movl 144(%esp),%edi
486 # v += w
487 add %ebx,%edi
488 # p <<<= 7
489 rol $7,%eax
490 # p ^= x4
491 xorl 116(%esp),%eax
492 # t <<<= 7
493 rol $7,%edx
494 # t ^= x14
495 xorl 156(%esp),%edx
496 # r <<<= 7
497 rol $7,%esi
498 # r ^= x9
499 xorl 136(%esp),%esi
500 # v <<<= 7
501 rol $7,%edi
502 # v ^= x3
503 xorl 112(%esp),%edi
504 # x4 = p
505 movl %eax,116(%esp)
506 # x14 = t
507 movl %edx,156(%esp)
508 # p += x0
509 addl 100(%esp),%eax
510 # x9 = r
511 movl %esi,136(%esp)
512 # t += x10
513 addl 140(%esp),%edx
514 # x3 = v
515 movl %edi,112(%esp)
516 # p <<<= 9
517 rol $9,%eax
518 # p ^= x8
519 xorl 132(%esp),%eax
520 # t <<<= 9
521 rol $9,%edx
522 # t ^= x2
523 xorl 108(%esp),%edx
524 # s += r
525 add %esi,%ecx
526 # s <<<= 9
527 rol $9,%ecx
528 # s ^= x13
529 xorl 152(%esp),%ecx
530 # w += v
531 add %edi,%ebx
532 # w <<<= 9
533 rol $9,%ebx
534 # w ^= x7
535 xorl 128(%esp),%ebx
536 # x8 = p
537 movl %eax,132(%esp)
538 # x2 = t
539 movl %edx,108(%esp)
540 # p += x4
541 addl 116(%esp),%eax
542 # x13 = s
543 movl %ecx,152(%esp)
544 # t += x14
545 addl 156(%esp),%edx
546 # x7 = w
547 movl %ebx,128(%esp)
548 # p <<<= 13
549 rol $13,%eax
550 # p ^= x12
551 xorl 148(%esp),%eax
552 # t <<<= 13
553 rol $13,%edx
554 # t ^= x6
555 xorl 124(%esp),%edx
556 # r += s
557 add %ecx,%esi
558 # r <<<= 13
559 rol $13,%esi
560 # r ^= x1
561 xorl 104(%esp),%esi
562 # v += w
563 add %ebx,%edi
564 # v <<<= 13
565 rol $13,%edi
566 # v ^= x11
567 xorl 144(%esp),%edi
568 # x12 = p
569 movl %eax,148(%esp)
570 # x6 = t
571 movl %edx,124(%esp)
572 # p += x8
573 addl 132(%esp),%eax
574 # x1 = r
575 movl %esi,104(%esp)
576 # t += x2
577 addl 108(%esp),%edx
578 # x11 = v
579 movl %edi,144(%esp)
580 # p <<<= 18
581 rol $18,%eax
582 # p ^= x0
583 xorl 100(%esp),%eax
584 # t <<<= 18
585 rol $18,%edx
586 # t ^= x10
587 xorl 140(%esp),%edx
588 # s += r
589 add %esi,%ecx
590 # s <<<= 18
591 rol $18,%ecx
592 # s ^= x5
593 xorl 120(%esp),%ecx
594 # w += v
595 add %edi,%ebx
596 # w <<<= 18
597 rol $18,%ebx
598 # w ^= x15
599 xorl 160(%esp),%ebx
600 # x0 = p
601 movl %eax,100(%esp)
602 # x10 = t
603 movl %edx,140(%esp)
604 # p += x3
605 addl 112(%esp),%eax
606 # p <<<= 7
607 rol $7,%eax
608 # x5 = s
609 movl %ecx,120(%esp)
610 # t += x9
611 addl 136(%esp),%edx
612 # x15 = w
613 movl %ebx,160(%esp)
614 # r = x4
615 movl 116(%esp),%esi
616 # r += s
617 add %ecx,%esi
618 # v = x14
619 movl 156(%esp),%edi
620 # v += w
621 add %ebx,%edi
622 # p ^= x1
623 xorl 104(%esp),%eax
624 # t <<<= 7
625 rol $7,%edx
626 # t ^= x11
627 xorl 144(%esp),%edx
628 # r <<<= 7
629 rol $7,%esi
630 # r ^= x6
631 xorl 124(%esp),%esi
632 # v <<<= 7
633 rol $7,%edi
634 # v ^= x12
635 xorl 148(%esp),%edi
636 # x1 = p
637 movl %eax,104(%esp)
638 # x11 = t
639 movl %edx,144(%esp)
640 # p += x0
641 addl 100(%esp),%eax
642 # x6 = r
643 movl %esi,124(%esp)
644 # t += x10
645 addl 140(%esp),%edx
646 # x12 = v
647 movl %edi,148(%esp)
648 # p <<<= 9
649 rol $9,%eax
650 # p ^= x2
651 xorl 108(%esp),%eax
652 # t <<<= 9
653 rol $9,%edx
654 # t ^= x8
655 xorl 132(%esp),%edx
656 # s += r
657 add %esi,%ecx
658 # s <<<= 9
659 rol $9,%ecx
660 # s ^= x7
661 xorl 128(%esp),%ecx
662 # w += v
663 add %edi,%ebx
664 # w <<<= 9
665 rol $9,%ebx
666 # w ^= x13
667 xorl 152(%esp),%ebx
668 # x2 = p
669 movl %eax,108(%esp)
670 # x8 = t
671 movl %edx,132(%esp)
672 # p += x1
673 addl 104(%esp),%eax
674 # x7 = s
675 movl %ecx,128(%esp)
676 # t += x11
677 addl 144(%esp),%edx
678 # x13 = w
679 movl %ebx,152(%esp)
680 # p <<<= 13
681 rol $13,%eax
682 # p ^= x3
683 xorl 112(%esp),%eax
684 # t <<<= 13
685 rol $13,%edx
686 # t ^= x9
687 xorl 136(%esp),%edx
688 # r += s
689 add %ecx,%esi
690 # r <<<= 13
691 rol $13,%esi
692 # r ^= x4
693 xorl 116(%esp),%esi
694 # v += w
695 add %ebx,%edi
696 # v <<<= 13
697 rol $13,%edi
698 # v ^= x14
699 xorl 156(%esp),%edi
700 # x3 = p
701 movl %eax,112(%esp)
702 # x9 = t
703 movl %edx,136(%esp)
704 # p += x2
705 addl 108(%esp),%eax
706 # x4 = r
707 movl %esi,116(%esp)
708 # t += x8
709 addl 132(%esp),%edx
710 # x14 = v
711 movl %edi,156(%esp)
712 # p <<<= 18
713 rol $18,%eax
714 # p ^= x0
715 xorl 100(%esp),%eax
716 # t <<<= 18
717 rol $18,%edx
718 # t ^= x10
719 xorl 140(%esp),%edx
720 # s += r
721 add %esi,%ecx
722 # s <<<= 18
723 rol $18,%ecx
724 # s ^= x5
725 xorl 120(%esp),%ecx
726 # w += v
727 add %edi,%ebx
728 # w <<<= 18
729 rol $18,%ebx
730 # w ^= x15
731 xorl 160(%esp),%ebx
732 # i -= 4
733 sub $4,%ebp
734 # goto mainloop if unsigned >
735 ja ._mainloop
736 # x0 = p
737 movl %eax,100(%esp)
738 # x5 = s
739 movl %ecx,120(%esp)
740 # x10 = t
741 movl %edx,140(%esp)
742 # x15 = w
743 movl %ebx,160(%esp)
744 # out = out_backup
745 movl 72(%esp),%edi
746 # m = m_backup
747 movl 68(%esp),%esi
748 # in0 = x0
749 movl 100(%esp),%eax
750 # in1 = x1
751 movl 104(%esp),%ecx
752 # in0 += j0
753 addl 164(%esp),%eax
754 # in1 += j1
755 addl 168(%esp),%ecx
756 # in0 ^= *(uint32 *) (m + 0)
757 xorl 0(%esi),%eax
758 # in1 ^= *(uint32 *) (m + 4)
759 xorl 4(%esi),%ecx
760 # *(uint32 *) (out + 0) = in0
761 movl %eax,0(%edi)
762 # *(uint32 *) (out + 4) = in1
763 movl %ecx,4(%edi)
764 # in2 = x2
765 movl 108(%esp),%eax
766 # in3 = x3
767 movl 112(%esp),%ecx
768 # in2 += j2
769 addl 172(%esp),%eax
770 # in3 += j3
771 addl 176(%esp),%ecx
772 # in2 ^= *(uint32 *) (m + 8)
773 xorl 8(%esi),%eax
774 # in3 ^= *(uint32 *) (m + 12)
775 xorl 12(%esi),%ecx
776 # *(uint32 *) (out + 8) = in2
777 movl %eax,8(%edi)
778 # *(uint32 *) (out + 12) = in3
779 movl %ecx,12(%edi)
780 # in4 = x4
781 movl 116(%esp),%eax
782 # in5 = x5
783 movl 120(%esp),%ecx
784 # in4 += j4
785 addl 180(%esp),%eax
786 # in5 += j5
787 addl 184(%esp),%ecx
788 # in4 ^= *(uint32 *) (m + 16)
789 xorl 16(%esi),%eax
790 # in5 ^= *(uint32 *) (m + 20)
791 xorl 20(%esi),%ecx
792 # *(uint32 *) (out + 16) = in4
793 movl %eax,16(%edi)
794 # *(uint32 *) (out + 20) = in5
795 movl %ecx,20(%edi)
796 # in6 = x6
797 movl 124(%esp),%eax
798 # in7 = x7
799 movl 128(%esp),%ecx
800 # in6 += j6
801 addl 188(%esp),%eax
802 # in7 += j7
803 addl 192(%esp),%ecx
804 # in6 ^= *(uint32 *) (m + 24)
805 xorl 24(%esi),%eax
806 # in7 ^= *(uint32 *) (m + 28)
807 xorl 28(%esi),%ecx
808 # *(uint32 *) (out + 24) = in6
809 movl %eax,24(%edi)
810 # *(uint32 *) (out + 28) = in7
811 movl %ecx,28(%edi)
812 # in8 = x8
813 movl 132(%esp),%eax
814 # in9 = x9
815 movl 136(%esp),%ecx
816 # in8 += j8
817 addl 196(%esp),%eax
818 # in9 += j9
819 addl 200(%esp),%ecx
820 # in8 ^= *(uint32 *) (m + 32)
821 xorl 32(%esi),%eax
822 # in9 ^= *(uint32 *) (m + 36)
823 xorl 36(%esi),%ecx
824 # *(uint32 *) (out + 32) = in8
825 movl %eax,32(%edi)
826 # *(uint32 *) (out + 36) = in9
827 movl %ecx,36(%edi)
828 # in10 = x10
829 movl 140(%esp),%eax
830 # in11 = x11
831 movl 144(%esp),%ecx
832 # in10 += j10
833 addl 204(%esp),%eax
834 # in11 += j11
835 addl 208(%esp),%ecx
836 # in10 ^= *(uint32 *) (m + 40)
837 xorl 40(%esi),%eax
838 # in11 ^= *(uint32 *) (m + 44)
839 xorl 44(%esi),%ecx
840 # *(uint32 *) (out + 40) = in10
841 movl %eax,40(%edi)
842 # *(uint32 *) (out + 44) = in11
843 movl %ecx,44(%edi)
844 # in12 = x12
845 movl 148(%esp),%eax
846 # in13 = x13
847 movl 152(%esp),%ecx
848 # in12 += j12
849 addl 212(%esp),%eax
850 # in13 += j13
851 addl 216(%esp),%ecx
852 # in12 ^= *(uint32 *) (m + 48)
853 xorl 48(%esi),%eax
854 # in13 ^= *(uint32 *) (m + 52)
855 xorl 52(%esi),%ecx
856 # *(uint32 *) (out + 48) = in12
857 movl %eax,48(%edi)
858 # *(uint32 *) (out + 52) = in13
859 movl %ecx,52(%edi)
860 # in14 = x14
861 movl 156(%esp),%eax
862 # in15 = x15
863 movl 160(%esp),%ecx
864 # in14 += j14
865 addl 220(%esp),%eax
866 # in15 += j15
867 addl 224(%esp),%ecx
868 # in14 ^= *(uint32 *) (m + 56)
869 xorl 56(%esi),%eax
870 # in15 ^= *(uint32 *) (m + 60)
871 xorl 60(%esi),%ecx
872 # *(uint32 *) (out + 56) = in14
873 movl %eax,56(%edi)
874 # *(uint32 *) (out + 60) = in15
875 movl %ecx,60(%edi)
876 # bytes = bytes_backup
877 movl 76(%esp),%ebx
878 # in8 = j8
879 movl 196(%esp),%eax
880 # in9 = j9
881 movl 200(%esp),%ecx
882 # in8 += 1
883 add $1,%eax
884 # in9 += 0 + carry
885 adc $0,%ecx
886 # j8 = in8
887 movl %eax,196(%esp)
888 # j9 = in9
889 movl %ecx,200(%esp)
890 # bytes - 64
891 cmp $64,%ebx
892 # goto bytesatleast65 if unsigned>
893 ja ._bytesatleast65
894 # goto bytesatleast64 if unsigned>=
895 jae ._bytesatleast64
896 # m = out
897 mov %edi,%esi
898 # out = ctarget
899 movl 228(%esp),%edi
900 # i = bytes
901 mov %ebx,%ecx
902 # while (i) { *out++ = *m++; --i }
903 rep movsb
904._bytesatleast64:
905 # x = x_backup
906 movl 64(%esp),%eax
907 # in8 = j8
908 movl 196(%esp),%ecx
909 # in9 = j9
910 movl 200(%esp),%edx
911 # *(uint32 *) (x + 32) = in8
912 movl %ecx,32(%eax)
913 # *(uint32 *) (x + 36) = in9
914 movl %edx,36(%eax)
915._done:
916 # eax = eax_stack
917 movl 80(%esp),%eax
918 # ebx = ebx_stack
919 movl 84(%esp),%ebx
920 # esi = esi_stack
921 movl 88(%esp),%esi
922 # edi = edi_stack
923 movl 92(%esp),%edi
924 # ebp = ebp_stack
925 movl 96(%esp),%ebp
926 # leave
927 add %eax,%esp
928 ret
929._bytesatleast65:
930 # bytes -= 64
931 sub $64,%ebx
932 # out += 64
933 add $64,%edi
934 # m += 64
935 add $64,%esi
936 # goto bytesatleast1
937 jmp ._bytesatleast1
938ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
deleted file mode 100644
index 03a4918f41ee..000000000000
--- a/arch/x86/crypto/salsa20-x86_64-asm_64.S
+++ /dev/null
@@ -1,805 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#include <linux/linkage.h>
3
4# enter salsa20_encrypt_bytes
5ENTRY(salsa20_encrypt_bytes)
6 mov %rsp,%r11
7 and $31,%r11
8 add $256,%r11
9 sub %r11,%rsp
10 # x = arg1
11 mov %rdi,%r8
12 # m = arg2
13 mov %rsi,%rsi
14 # out = arg3
15 mov %rdx,%rdi
16 # bytes = arg4
17 mov %rcx,%rdx
18 # unsigned>? bytes - 0
19 cmp $0,%rdx
20 # comment:fp stack unchanged by jump
21 # goto done if !unsigned>
22 jbe ._done
23 # comment:fp stack unchanged by fallthrough
24# start:
25._start:
26 # r11_stack = r11
27 movq %r11,0(%rsp)
28 # r12_stack = r12
29 movq %r12,8(%rsp)
30 # r13_stack = r13
31 movq %r13,16(%rsp)
32 # r14_stack = r14
33 movq %r14,24(%rsp)
34 # r15_stack = r15
35 movq %r15,32(%rsp)
36 # rbx_stack = rbx
37 movq %rbx,40(%rsp)
38 # rbp_stack = rbp
39 movq %rbp,48(%rsp)
40 # in0 = *(uint64 *) (x + 0)
41 movq 0(%r8),%rcx
42 # in2 = *(uint64 *) (x + 8)
43 movq 8(%r8),%r9
44 # in4 = *(uint64 *) (x + 16)
45 movq 16(%r8),%rax
46 # in6 = *(uint64 *) (x + 24)
47 movq 24(%r8),%r10
48 # in8 = *(uint64 *) (x + 32)
49 movq 32(%r8),%r11
50 # in10 = *(uint64 *) (x + 40)
51 movq 40(%r8),%r12
52 # in12 = *(uint64 *) (x + 48)
53 movq 48(%r8),%r13
54 # in14 = *(uint64 *) (x + 56)
55 movq 56(%r8),%r14
56 # j0 = in0
57 movq %rcx,56(%rsp)
58 # j2 = in2
59 movq %r9,64(%rsp)
60 # j4 = in4
61 movq %rax,72(%rsp)
62 # j6 = in6
63 movq %r10,80(%rsp)
64 # j8 = in8
65 movq %r11,88(%rsp)
66 # j10 = in10
67 movq %r12,96(%rsp)
68 # j12 = in12
69 movq %r13,104(%rsp)
70 # j14 = in14
71 movq %r14,112(%rsp)
72 # x_backup = x
73 movq %r8,120(%rsp)
74# bytesatleast1:
75._bytesatleast1:
76 # unsigned<? bytes - 64
77 cmp $64,%rdx
78 # comment:fp stack unchanged by jump
79 # goto nocopy if !unsigned<
80 jae ._nocopy
81 # ctarget = out
82 movq %rdi,128(%rsp)
83 # out = &tmp
84 leaq 192(%rsp),%rdi
85 # i = bytes
86 mov %rdx,%rcx
87 # while (i) { *out++ = *m++; --i }
88 rep movsb
89 # out = &tmp
90 leaq 192(%rsp),%rdi
91 # m = &tmp
92 leaq 192(%rsp),%rsi
93 # comment:fp stack unchanged by fallthrough
94# nocopy:
95._nocopy:
96 # out_backup = out
97 movq %rdi,136(%rsp)
98 # m_backup = m
99 movq %rsi,144(%rsp)
100 # bytes_backup = bytes
101 movq %rdx,152(%rsp)
102 # x1 = j0
103 movq 56(%rsp),%rdi
104 # x0 = x1
105 mov %rdi,%rdx
106 # (uint64) x1 >>= 32
107 shr $32,%rdi
108 # x3 = j2
109 movq 64(%rsp),%rsi
110 # x2 = x3
111 mov %rsi,%rcx
112 # (uint64) x3 >>= 32
113 shr $32,%rsi
114 # x5 = j4
115 movq 72(%rsp),%r8
116 # x4 = x5
117 mov %r8,%r9
118 # (uint64) x5 >>= 32
119 shr $32,%r8
120 # x5_stack = x5
121 movq %r8,160(%rsp)
122 # x7 = j6
123 movq 80(%rsp),%r8
124 # x6 = x7
125 mov %r8,%rax
126 # (uint64) x7 >>= 32
127 shr $32,%r8
128 # x9 = j8
129 movq 88(%rsp),%r10
130 # x8 = x9
131 mov %r10,%r11
132 # (uint64) x9 >>= 32
133 shr $32,%r10
134 # x11 = j10
135 movq 96(%rsp),%r12
136 # x10 = x11
137 mov %r12,%r13
138 # x10_stack = x10
139 movq %r13,168(%rsp)
140 # (uint64) x11 >>= 32
141 shr $32,%r12
142 # x13 = j12
143 movq 104(%rsp),%r13
144 # x12 = x13
145 mov %r13,%r14
146 # (uint64) x13 >>= 32
147 shr $32,%r13
148 # x15 = j14
149 movq 112(%rsp),%r15
150 # x14 = x15
151 mov %r15,%rbx
152 # (uint64) x15 >>= 32
153 shr $32,%r15
154 # x15_stack = x15
155 movq %r15,176(%rsp)
156 # i = 20
157 mov $20,%r15
158# mainloop:
159._mainloop:
160 # i_backup = i
161 movq %r15,184(%rsp)
162 # x5 = x5_stack
163 movq 160(%rsp),%r15
164 # a = x12 + x0
165 lea (%r14,%rdx),%rbp
166 # (uint32) a <<<= 7
167 rol $7,%ebp
168 # x4 ^= a
169 xor %rbp,%r9
170 # b = x1 + x5
171 lea (%rdi,%r15),%rbp
172 # (uint32) b <<<= 7
173 rol $7,%ebp
174 # x9 ^= b
175 xor %rbp,%r10
176 # a = x0 + x4
177 lea (%rdx,%r9),%rbp
178 # (uint32) a <<<= 9
179 rol $9,%ebp
180 # x8 ^= a
181 xor %rbp,%r11
182 # b = x5 + x9
183 lea (%r15,%r10),%rbp
184 # (uint32) b <<<= 9
185 rol $9,%ebp
186 # x13 ^= b
187 xor %rbp,%r13
188 # a = x4 + x8
189 lea (%r9,%r11),%rbp
190 # (uint32) a <<<= 13
191 rol $13,%ebp
192 # x12 ^= a
193 xor %rbp,%r14
194 # b = x9 + x13
195 lea (%r10,%r13),%rbp
196 # (uint32) b <<<= 13
197 rol $13,%ebp
198 # x1 ^= b
199 xor %rbp,%rdi
200 # a = x8 + x12
201 lea (%r11,%r14),%rbp
202 # (uint32) a <<<= 18
203 rol $18,%ebp
204 # x0 ^= a
205 xor %rbp,%rdx
206 # b = x13 + x1
207 lea (%r13,%rdi),%rbp
208 # (uint32) b <<<= 18
209 rol $18,%ebp
210 # x5 ^= b
211 xor %rbp,%r15
212 # x10 = x10_stack
213 movq 168(%rsp),%rbp
214 # x5_stack = x5
215 movq %r15,160(%rsp)
216 # c = x6 + x10
217 lea (%rax,%rbp),%r15
218 # (uint32) c <<<= 7
219 rol $7,%r15d
220 # x14 ^= c
221 xor %r15,%rbx
222 # c = x10 + x14
223 lea (%rbp,%rbx),%r15
224 # (uint32) c <<<= 9
225 rol $9,%r15d
226 # x2 ^= c
227 xor %r15,%rcx
228 # c = x14 + x2
229 lea (%rbx,%rcx),%r15
230 # (uint32) c <<<= 13
231 rol $13,%r15d
232 # x6 ^= c
233 xor %r15,%rax
234 # c = x2 + x6
235 lea (%rcx,%rax),%r15
236 # (uint32) c <<<= 18
237 rol $18,%r15d
238 # x10 ^= c
239 xor %r15,%rbp
240 # x15 = x15_stack
241 movq 176(%rsp),%r15
242 # x10_stack = x10
243 movq %rbp,168(%rsp)
244 # d = x11 + x15
245 lea (%r12,%r15),%rbp
246 # (uint32) d <<<= 7
247 rol $7,%ebp
248 # x3 ^= d
249 xor %rbp,%rsi
250 # d = x15 + x3
251 lea (%r15,%rsi),%rbp
252 # (uint32) d <<<= 9
253 rol $9,%ebp
254 # x7 ^= d
255 xor %rbp,%r8
256 # d = x3 + x7
257 lea (%rsi,%r8),%rbp
258 # (uint32) d <<<= 13
259 rol $13,%ebp
260 # x11 ^= d
261 xor %rbp,%r12
262 # d = x7 + x11
263 lea (%r8,%r12),%rbp
264 # (uint32) d <<<= 18
265 rol $18,%ebp
266 # x15 ^= d
267 xor %rbp,%r15
268 # x15_stack = x15
269 movq %r15,176(%rsp)
270 # x5 = x5_stack
271 movq 160(%rsp),%r15
272 # a = x3 + x0
273 lea (%rsi,%rdx),%rbp
274 # (uint32) a <<<= 7
275 rol $7,%ebp
276 # x1 ^= a
277 xor %rbp,%rdi
278 # b = x4 + x5
279 lea (%r9,%r15),%rbp
280 # (uint32) b <<<= 7
281 rol $7,%ebp
282 # x6 ^= b
283 xor %rbp,%rax
284 # a = x0 + x1
285 lea (%rdx,%rdi),%rbp
286 # (uint32) a <<<= 9
287 rol $9,%ebp
288 # x2 ^= a
289 xor %rbp,%rcx
290 # b = x5 + x6
291 lea (%r15,%rax),%rbp
292 # (uint32) b <<<= 9
293 rol $9,%ebp
294 # x7 ^= b
295 xor %rbp,%r8
296 # a = x1 + x2
297 lea (%rdi,%rcx),%rbp
298 # (uint32) a <<<= 13
299 rol $13,%ebp
300 # x3 ^= a
301 xor %rbp,%rsi
302 # b = x6 + x7
303 lea (%rax,%r8),%rbp
304 # (uint32) b <<<= 13
305 rol $13,%ebp
306 # x4 ^= b
307 xor %rbp,%r9
308 # a = x2 + x3
309 lea (%rcx,%rsi),%rbp
310 # (uint32) a <<<= 18
311 rol $18,%ebp
312 # x0 ^= a
313 xor %rbp,%rdx
314 # b = x7 + x4
315 lea (%r8,%r9),%rbp
316 # (uint32) b <<<= 18
317 rol $18,%ebp
318 # x5 ^= b
319 xor %rbp,%r15
320 # x10 = x10_stack
321 movq 168(%rsp),%rbp
322 # x5_stack = x5
323 movq %r15,160(%rsp)
324 # c = x9 + x10
325 lea (%r10,%rbp),%r15
326 # (uint32) c <<<= 7
327 rol $7,%r15d
328 # x11 ^= c
329 xor %r15,%r12
330 # c = x10 + x11
331 lea (%rbp,%r12),%r15
332 # (uint32) c <<<= 9
333 rol $9,%r15d
334 # x8 ^= c
335 xor %r15,%r11
336 # c = x11 + x8
337 lea (%r12,%r11),%r15
338 # (uint32) c <<<= 13
339 rol $13,%r15d
340 # x9 ^= c
341 xor %r15,%r10
342 # c = x8 + x9
343 lea (%r11,%r10),%r15
344 # (uint32) c <<<= 18
345 rol $18,%r15d
346 # x10 ^= c
347 xor %r15,%rbp
348 # x15 = x15_stack
349 movq 176(%rsp),%r15
350 # x10_stack = x10
351 movq %rbp,168(%rsp)
352 # d = x14 + x15
353 lea (%rbx,%r15),%rbp
354 # (uint32) d <<<= 7
355 rol $7,%ebp
356 # x12 ^= d
357 xor %rbp,%r14
358 # d = x15 + x12
359 lea (%r15,%r14),%rbp
360 # (uint32) d <<<= 9
361 rol $9,%ebp
362 # x13 ^= d
363 xor %rbp,%r13
364 # d = x12 + x13
365 lea (%r14,%r13),%rbp
366 # (uint32) d <<<= 13
367 rol $13,%ebp
368 # x14 ^= d
369 xor %rbp,%rbx
370 # d = x13 + x14
371 lea (%r13,%rbx),%rbp
372 # (uint32) d <<<= 18
373 rol $18,%ebp
374 # x15 ^= d
375 xor %rbp,%r15
376 # x15_stack = x15
377 movq %r15,176(%rsp)
378 # x5 = x5_stack
379 movq 160(%rsp),%r15
380 # a = x12 + x0
381 lea (%r14,%rdx),%rbp
382 # (uint32) a <<<= 7
383 rol $7,%ebp
384 # x4 ^= a
385 xor %rbp,%r9
386 # b = x1 + x5
387 lea (%rdi,%r15),%rbp
388 # (uint32) b <<<= 7
389 rol $7,%ebp
390 # x9 ^= b
391 xor %rbp,%r10
392 # a = x0 + x4
393 lea (%rdx,%r9),%rbp
394 # (uint32) a <<<= 9
395 rol $9,%ebp
396 # x8 ^= a
397 xor %rbp,%r11
398 # b = x5 + x9
399 lea (%r15,%r10),%rbp
400 # (uint32) b <<<= 9
401 rol $9,%ebp
402 # x13 ^= b
403 xor %rbp,%r13
404 # a = x4 + x8
405 lea (%r9,%r11),%rbp
406 # (uint32) a <<<= 13
407 rol $13,%ebp
408 # x12 ^= a
409 xor %rbp,%r14
410 # b = x9 + x13
411 lea (%r10,%r13),%rbp
412 # (uint32) b <<<= 13
413 rol $13,%ebp
414 # x1 ^= b
415 xor %rbp,%rdi
416 # a = x8 + x12
417 lea (%r11,%r14),%rbp
418 # (uint32) a <<<= 18
419 rol $18,%ebp
420 # x0 ^= a
421 xor %rbp,%rdx
422 # b = x13 + x1
423 lea (%r13,%rdi),%rbp
424 # (uint32) b <<<= 18
425 rol $18,%ebp
426 # x5 ^= b
427 xor %rbp,%r15
428 # x10 = x10_stack
429 movq 168(%rsp),%rbp
430 # x5_stack = x5
431 movq %r15,160(%rsp)
432 # c = x6 + x10
433 lea (%rax,%rbp),%r15
434 # (uint32) c <<<= 7
435 rol $7,%r15d
436 # x14 ^= c
437 xor %r15,%rbx
438 # c = x10 + x14
439 lea (%rbp,%rbx),%r15
440 # (uint32) c <<<= 9
441 rol $9,%r15d
442 # x2 ^= c
443 xor %r15,%rcx
444 # c = x14 + x2
445 lea (%rbx,%rcx),%r15
446 # (uint32) c <<<= 13
447 rol $13,%r15d
448 # x6 ^= c
449 xor %r15,%rax
450 # c = x2 + x6
451 lea (%rcx,%rax),%r15
452 # (uint32) c <<<= 18
453 rol $18,%r15d
454 # x10 ^= c
455 xor %r15,%rbp
456 # x15 = x15_stack
457 movq 176(%rsp),%r15
458 # x10_stack = x10
459 movq %rbp,168(%rsp)
460 # d = x11 + x15
461 lea (%r12,%r15),%rbp
462 # (uint32) d <<<= 7
463 rol $7,%ebp
464 # x3 ^= d
465 xor %rbp,%rsi
466 # d = x15 + x3
467 lea (%r15,%rsi),%rbp
468 # (uint32) d <<<= 9
469 rol $9,%ebp
470 # x7 ^= d
471 xor %rbp,%r8
472 # d = x3 + x7
473 lea (%rsi,%r8),%rbp
474 # (uint32) d <<<= 13
475 rol $13,%ebp
476 # x11 ^= d
477 xor %rbp,%r12
478 # d = x7 + x11
479 lea (%r8,%r12),%rbp
480 # (uint32) d <<<= 18
481 rol $18,%ebp
482 # x15 ^= d
483 xor %rbp,%r15
484 # x15_stack = x15
485 movq %r15,176(%rsp)
486 # x5 = x5_stack
487 movq 160(%rsp),%r15
488 # a = x3 + x0
489 lea (%rsi,%rdx),%rbp
490 # (uint32) a <<<= 7
491 rol $7,%ebp
492 # x1 ^= a
493 xor %rbp,%rdi
494 # b = x4 + x5
495 lea (%r9,%r15),%rbp
496 # (uint32) b <<<= 7
497 rol $7,%ebp
498 # x6 ^= b
499 xor %rbp,%rax
500 # a = x0 + x1
501 lea (%rdx,%rdi),%rbp
502 # (uint32) a <<<= 9
503 rol $9,%ebp
504 # x2 ^= a
505 xor %rbp,%rcx
506 # b = x5 + x6
507 lea (%r15,%rax),%rbp
508 # (uint32) b <<<= 9
509 rol $9,%ebp
510 # x7 ^= b
511 xor %rbp,%r8
512 # a = x1 + x2
513 lea (%rdi,%rcx),%rbp
514 # (uint32) a <<<= 13
515 rol $13,%ebp
516 # x3 ^= a
517 xor %rbp,%rsi
518 # b = x6 + x7
519 lea (%rax,%r8),%rbp
520 # (uint32) b <<<= 13
521 rol $13,%ebp
522 # x4 ^= b
523 xor %rbp,%r9
524 # a = x2 + x3
525 lea (%rcx,%rsi),%rbp
526 # (uint32) a <<<= 18
527 rol $18,%ebp
528 # x0 ^= a
529 xor %rbp,%rdx
530 # b = x7 + x4
531 lea (%r8,%r9),%rbp
532 # (uint32) b <<<= 18
533 rol $18,%ebp
534 # x5 ^= b
535 xor %rbp,%r15
536 # x10 = x10_stack
537 movq 168(%rsp),%rbp
538 # x5_stack = x5
539 movq %r15,160(%rsp)
540 # c = x9 + x10
541 lea (%r10,%rbp),%r15
542 # (uint32) c <<<= 7
543 rol $7,%r15d
544 # x11 ^= c
545 xor %r15,%r12
546 # c = x10 + x11
547 lea (%rbp,%r12),%r15
548 # (uint32) c <<<= 9
549 rol $9,%r15d
550 # x8 ^= c
551 xor %r15,%r11
552 # c = x11 + x8
553 lea (%r12,%r11),%r15
554 # (uint32) c <<<= 13
555 rol $13,%r15d
556 # x9 ^= c
557 xor %r15,%r10
558 # c = x8 + x9
559 lea (%r11,%r10),%r15
560 # (uint32) c <<<= 18
561 rol $18,%r15d
562 # x10 ^= c
563 xor %r15,%rbp
564 # x15 = x15_stack
565 movq 176(%rsp),%r15
566 # x10_stack = x10
567 movq %rbp,168(%rsp)
568 # d = x14 + x15
569 lea (%rbx,%r15),%rbp
570 # (uint32) d <<<= 7
571 rol $7,%ebp
572 # x12 ^= d
573 xor %rbp,%r14
574 # d = x15 + x12
575 lea (%r15,%r14),%rbp
576 # (uint32) d <<<= 9
577 rol $9,%ebp
578 # x13 ^= d
579 xor %rbp,%r13
580 # d = x12 + x13
581 lea (%r14,%r13),%rbp
582 # (uint32) d <<<= 13
583 rol $13,%ebp
584 # x14 ^= d
585 xor %rbp,%rbx
586 # d = x13 + x14
587 lea (%r13,%rbx),%rbp
588 # (uint32) d <<<= 18
589 rol $18,%ebp
590 # x15 ^= d
591 xor %rbp,%r15
592 # x15_stack = x15
593 movq %r15,176(%rsp)
594 # i = i_backup
595 movq 184(%rsp),%r15
596 # unsigned>? i -= 4
597 sub $4,%r15
598 # comment:fp stack unchanged by jump
599 # goto mainloop if unsigned>
600 ja ._mainloop
601 # (uint32) x2 += j2
602 addl 64(%rsp),%ecx
603 # x3 <<= 32
604 shl $32,%rsi
605 # x3 += j2
606 addq 64(%rsp),%rsi
607 # (uint64) x3 >>= 32
608 shr $32,%rsi
609 # x3 <<= 32
610 shl $32,%rsi
611 # x2 += x3
612 add %rsi,%rcx
613 # (uint32) x6 += j6
614 addl 80(%rsp),%eax
615 # x7 <<= 32
616 shl $32,%r8
617 # x7 += j6
618 addq 80(%rsp),%r8
619 # (uint64) x7 >>= 32
620 shr $32,%r8
621 # x7 <<= 32
622 shl $32,%r8
623 # x6 += x7
624 add %r8,%rax
625 # (uint32) x8 += j8
626 addl 88(%rsp),%r11d
627 # x9 <<= 32
628 shl $32,%r10
629 # x9 += j8
630 addq 88(%rsp),%r10
631 # (uint64) x9 >>= 32
632 shr $32,%r10
633 # x9 <<= 32
634 shl $32,%r10
635 # x8 += x9
636 add %r10,%r11
637 # (uint32) x12 += j12
638 addl 104(%rsp),%r14d
639 # x13 <<= 32
640 shl $32,%r13
641 # x13 += j12
642 addq 104(%rsp),%r13
643 # (uint64) x13 >>= 32
644 shr $32,%r13
645 # x13 <<= 32
646 shl $32,%r13
647 # x12 += x13
648 add %r13,%r14
649 # (uint32) x0 += j0
650 addl 56(%rsp),%edx
651 # x1 <<= 32
652 shl $32,%rdi
653 # x1 += j0
654 addq 56(%rsp),%rdi
655 # (uint64) x1 >>= 32
656 shr $32,%rdi
657 # x1 <<= 32
658 shl $32,%rdi
659 # x0 += x1
660 add %rdi,%rdx
661 # x5 = x5_stack
662 movq 160(%rsp),%rdi
663 # (uint32) x4 += j4
664 addl 72(%rsp),%r9d
665 # x5 <<= 32
666 shl $32,%rdi
667 # x5 += j4
668 addq 72(%rsp),%rdi
669 # (uint64) x5 >>= 32
670 shr $32,%rdi
671 # x5 <<= 32
672 shl $32,%rdi
673 # x4 += x5
674 add %rdi,%r9
675 # x10 = x10_stack
676 movq 168(%rsp),%r8
677 # (uint32) x10 += j10
678 addl 96(%rsp),%r8d
679 # x11 <<= 32
680 shl $32,%r12
681 # x11 += j10
682 addq 96(%rsp),%r12
683 # (uint64) x11 >>= 32
684 shr $32,%r12
685 # x11 <<= 32
686 shl $32,%r12
687 # x10 += x11
688 add %r12,%r8
689 # x15 = x15_stack
690 movq 176(%rsp),%rdi
691 # (uint32) x14 += j14
692 addl 112(%rsp),%ebx
693 # x15 <<= 32
694 shl $32,%rdi
695 # x15 += j14
696 addq 112(%rsp),%rdi
697 # (uint64) x15 >>= 32
698 shr $32,%rdi
699 # x15 <<= 32
700 shl $32,%rdi
701 # x14 += x15
702 add %rdi,%rbx
703 # out = out_backup
704 movq 136(%rsp),%rdi
705 # m = m_backup
706 movq 144(%rsp),%rsi
707 # x0 ^= *(uint64 *) (m + 0)
708 xorq 0(%rsi),%rdx
709 # *(uint64 *) (out + 0) = x0
710 movq %rdx,0(%rdi)
711 # x2 ^= *(uint64 *) (m + 8)
712 xorq 8(%rsi),%rcx
713 # *(uint64 *) (out + 8) = x2
714 movq %rcx,8(%rdi)
715 # x4 ^= *(uint64 *) (m + 16)
716 xorq 16(%rsi),%r9
717 # *(uint64 *) (out + 16) = x4
718 movq %r9,16(%rdi)
719 # x6 ^= *(uint64 *) (m + 24)
720 xorq 24(%rsi),%rax
721 # *(uint64 *) (out + 24) = x6
722 movq %rax,24(%rdi)
723 # x8 ^= *(uint64 *) (m + 32)
724 xorq 32(%rsi),%r11
725 # *(uint64 *) (out + 32) = x8
726 movq %r11,32(%rdi)
727 # x10 ^= *(uint64 *) (m + 40)
728 xorq 40(%rsi),%r8
729 # *(uint64 *) (out + 40) = x10
730 movq %r8,40(%rdi)
731 # x12 ^= *(uint64 *) (m + 48)
732 xorq 48(%rsi),%r14
733 # *(uint64 *) (out + 48) = x12
734 movq %r14,48(%rdi)
735 # x14 ^= *(uint64 *) (m + 56)
736 xorq 56(%rsi),%rbx
737 # *(uint64 *) (out + 56) = x14
738 movq %rbx,56(%rdi)
739 # bytes = bytes_backup
740 movq 152(%rsp),%rdx
741 # in8 = j8
742 movq 88(%rsp),%rcx
743 # in8 += 1
744 add $1,%rcx
745 # j8 = in8
746 movq %rcx,88(%rsp)
747 # unsigned>? unsigned<? bytes - 64
748 cmp $64,%rdx
749 # comment:fp stack unchanged by jump
750 # goto bytesatleast65 if unsigned>
751 ja ._bytesatleast65
752 # comment:fp stack unchanged by jump
753 # goto bytesatleast64 if !unsigned<
754 jae ._bytesatleast64
755 # m = out
756 mov %rdi,%rsi
757 # out = ctarget
758 movq 128(%rsp),%rdi
759 # i = bytes
760 mov %rdx,%rcx
761 # while (i) { *out++ = *m++; --i }
762 rep movsb
763 # comment:fp stack unchanged by fallthrough
764# bytesatleast64:
765._bytesatleast64:
766 # x = x_backup
767 movq 120(%rsp),%rdi
768 # in8 = j8
769 movq 88(%rsp),%rsi
770 # *(uint64 *) (x + 32) = in8
771 movq %rsi,32(%rdi)
772 # r11 = r11_stack
773 movq 0(%rsp),%r11
774 # r12 = r12_stack
775 movq 8(%rsp),%r12
776 # r13 = r13_stack
777 movq 16(%rsp),%r13
778 # r14 = r14_stack
779 movq 24(%rsp),%r14
780 # r15 = r15_stack
781 movq 32(%rsp),%r15
782 # rbx = rbx_stack
783 movq 40(%rsp),%rbx
784 # rbp = rbp_stack
785 movq 48(%rsp),%rbp
786 # comment:fp stack unchanged by fallthrough
787# done:
788._done:
789 # leave
790 add %r11,%rsp
791 mov %rdi,%rax
792 mov %rsi,%rdx
793 ret
794# bytesatleast65:
795._bytesatleast65:
796 # bytes -= 64
797 sub $64,%rdx
798 # out += 64
799 add $64,%rdi
800 # m += 64
801 add $64,%rsi
802 # comment:fp stack unchanged by jump
803 # goto bytesatleast1
804 jmp ._bytesatleast1
805ENDPROC(salsa20_encrypt_bytes)
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
deleted file mode 100644
index b07d7d959806..000000000000
--- a/arch/x86/crypto/salsa20_glue.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/*
2 * Glue code for optimized assembly version of Salsa20.
3 *
4 * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
5 *
6 * The assembly code is public domain code written by Daniel J.
7 * Bernstein <djb@cr.yp.to>. It has been modified to include indentation
8 * and to remove extraneous comments and functions that are not needed.
9 * - i586 version, renamed as salsa20-i586-asm_32.S
10 * available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
11 * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
12 * available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
13 *
14 * Also modified to set up the initial state using the generic C code rather
15 * than in assembly.
16 *
17 * This program is free software; you can redistribute it and/or modify it
18 * under the terms of the GNU General Public License as published by the Free
19 * Software Foundation; either version 2 of the License, or (at your option)
20 * any later version.
21 *
22 */
23
24#include <asm/unaligned.h>
25#include <crypto/internal/skcipher.h>
26#include <crypto/salsa20.h>
27#include <linux/module.h>
28
29asmlinkage void salsa20_encrypt_bytes(u32 state[16], const u8 *src, u8 *dst,
30 u32 bytes);
31
32static int salsa20_asm_crypt(struct skcipher_request *req)
33{
34 struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
35 const struct salsa20_ctx *ctx = crypto_skcipher_ctx(tfm);
36 struct skcipher_walk walk;
37 u32 state[16];
38 int err;
39
40 err = skcipher_walk_virt(&walk, req, true);
41
42 crypto_salsa20_init(state, ctx, walk.iv);
43
44 while (walk.nbytes > 0) {
45 unsigned int nbytes = walk.nbytes;
46
47 if (nbytes < walk.total)
48 nbytes = round_down(nbytes, walk.stride);
49
50 salsa20_encrypt_bytes(state, walk.src.virt.addr,
51 walk.dst.virt.addr, nbytes);
52 err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
53 }
54
55 return err;
56}
57
58static struct skcipher_alg alg = {
59 .base.cra_name = "salsa20",
60 .base.cra_driver_name = "salsa20-asm",
61 .base.cra_priority = 200,
62 .base.cra_blocksize = 1,
63 .base.cra_ctxsize = sizeof(struct salsa20_ctx),
64 .base.cra_module = THIS_MODULE,
65
66 .min_keysize = SALSA20_MIN_KEY_SIZE,
67 .max_keysize = SALSA20_MAX_KEY_SIZE,
68 .ivsize = SALSA20_IV_SIZE,
69 .chunksize = SALSA20_BLOCK_SIZE,
70 .setkey = crypto_salsa20_setkey,
71 .encrypt = salsa20_asm_crypt,
72 .decrypt = salsa20_asm_crypt,
73};
74
75static int __init init(void)
76{
77 return crypto_register_skcipher(&alg);
78}
79
80static void __exit fini(void)
81{
82 crypto_unregister_skcipher(&alg);
83}
84
85module_init(init);
86module_exit(fini);
87
88MODULE_LICENSE("GPL");
89MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
90MODULE_ALIAS_CRYPTO("salsa20");
91MODULE_ALIAS_CRYPTO("salsa20-asm");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 30d54a56e64a..f3e40ac56d93 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -1436,34 +1436,6 @@ config CRYPTO_SALSA20
1436 The Salsa20 stream cipher algorithm is designed by Daniel J. 1436 The Salsa20 stream cipher algorithm is designed by Daniel J.
1437 Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html> 1437 Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
1438 1438
1439config CRYPTO_SALSA20_586
1440 tristate "Salsa20 stream cipher algorithm (i586)"
1441 depends on (X86 || UML_X86) && !64BIT
1442 select CRYPTO_BLKCIPHER
1443 select CRYPTO_SALSA20
1444 help
1445 Salsa20 stream cipher algorithm, i586 assembly implementation.
1446
1447 Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
1448 Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
1449
1450 The Salsa20 stream cipher algorithm is designed by Daniel J.
1451 Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
1452
1453config CRYPTO_SALSA20_X86_64
1454 tristate "Salsa20 stream cipher algorithm (x86_64)"
1455 depends on (X86 || UML_X86) && 64BIT
1456 select CRYPTO_BLKCIPHER
1457 select CRYPTO_SALSA20
1458 help
1459 Salsa20 stream cipher algorithm, x86_64 assembly implementation.
1460
1461 Salsa20 is a stream cipher submitted to eSTREAM, the ECRYPT
1462 Stream Cipher Project. See <http://www.ecrypt.eu.org/stream/>
1463
1464 The Salsa20 stream cipher algorithm is designed by Daniel J.
1465 Bernstein <djb@cr.yp.to>. See <http://cr.yp.to/snuffle.html>
1466
1467config CRYPTO_CHACHA20 1439config CRYPTO_CHACHA20
1468 tristate "ChaCha20 cipher algorithm" 1440 tristate "ChaCha20 cipher algorithm"
1469 select CRYPTO_BLKCIPHER 1441 select CRYPTO_BLKCIPHER