author	Tan Swee Heng <thesweeheng@gmail.com>	2007-12-17 11:04:40 -0500
committer	Herbert Xu <herbert@gondor.apana.org.au>	2008-01-10 16:16:57 -0500
commit	9a7dafbba47384c330779c75a1546684efaa8c1a (patch)
tree	0fde4a938ebc3c9deb0873b709dc5d2d69ab25c3 /arch
parent	974e4b752ee623854c5dc2bbfc7c7725029ce173 (diff)
[CRYPTO] salsa20: Add x86-64 assembly version

This is the x86-64 version of the Salsa20 stream cipher algorithm. The
original assembly code came from
<http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>. It has been
reformatted for clarity.

Signed-off-by: Tan Swee Heng <thesweeheng@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
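For readers working through the assembly below, here is a minimal C sketch of the operation it implements: the Salsa20 quarter-round and double round, using the rotation counts 7, 9, 13 and 18 that appear in the rol instructions. The helper names rotl32, quarterround and doubleround are illustrative only and not part of the patch; the assembly's main loop counts i down from 20 in steps of 4, running two double rounds per iteration for the usual 20 rounds.

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int n)
{
	return (v << n) | (v >> (32 - n));
}

/* One Salsa20 quarter-round: matches each group of lea/rol/xor steps
 * in the assembly (rotation counts 7, 9, 13, 18). */
static void quarterround(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*b ^= rotl32(*a + *d, 7);
	*c ^= rotl32(*b + *a, 9);
	*d ^= rotl32(*c + *b, 13);
	*a ^= rotl32(*d + *c, 18);
}

/* One double round (column round, then row round) over the 4x4 word state. */
static void doubleround(uint32_t x[16])
{
	/* column round */
	quarterround(&x[0],  &x[4],  &x[8],  &x[12]);
	quarterround(&x[5],  &x[9],  &x[13], &x[1]);
	quarterround(&x[10], &x[14], &x[2],  &x[6]);
	quarterround(&x[15], &x[3],  &x[7],  &x[11]);
	/* row round */
	quarterround(&x[0],  &x[1],  &x[2],  &x[3]);
	quarterround(&x[5],  &x[6],  &x[7],  &x[4]);
	quarterround(&x[10], &x[11], &x[8],  &x[9]);
	quarterround(&x[15], &x[12], &x[13], &x[14]);
}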
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/crypto/Makefile	2
-rw-r--r--	arch/x86/crypto/salsa20-x86_64-asm_64.S	920
-rw-r--r--	arch/x86/crypto/salsa20_glue.c	2
3 files changed, 924 insertions(+), 0 deletions(-)
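The assembly exports three entry points whose argument roles are spelled out in its "arg1".."arg4" comments (System V AMD64 order: %rdi, %rsi, %rdx, %rcx). The prototypes below are a sketch inferred from those comments, not the kernel's own declarations, which live in arch/x86/crypto/salsa20_glue.c and may use different type names; here 'x' is the 64-byte (16 x 32-bit word) Salsa20 state.

#include <stdint.h>

/* Inferred from the assembly's argument comments; illustrative only. */
extern void ECRYPT_keysetup(uint32_t *x, const uint8_t *k, uint32_t kbits);
extern void ECRYPT_ivsetup(uint32_t *x, const uint8_t *iv);
extern void ECRYPT_encrypt_bytes(uint32_t *x, const uint8_t *m, uint8_t *out,
				 uint32_t bytes);

Note that ECRYPT_ivsetup also zeroes the 64-bit block counter stored at byte offset 32 of the state, and ECRYPT_encrypt_bytes increments that counter after each 64-byte keystream block.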
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 25cc8441046a..09200e12f14d 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
 
 aes-i586-y := aes-i586-asm_32.o aes_glue.o
 twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
@@ -15,3 +16,4 @@ salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
 
 aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
 twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
+salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
new file mode 100644
index 000000000000..6214a9b09706
--- /dev/null
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -0,0 +1,920 @@
+# enter ECRYPT_encrypt_bytes
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+	mov %rsp,%r11
+	and $31,%r11
+	add $256,%r11
+	sub %r11,%rsp
+	# x = arg1
+	mov %rdi,%r8
+	# m = arg2
+	mov %rsi,%rsi
+	# out = arg3
+	mov %rdx,%rdi
+	# bytes = arg4
+	mov %rcx,%rdx
+	# unsigned>? bytes - 0
+	cmp $0,%rdx
+	# comment:fp stack unchanged by jump
+	# goto done if !unsigned>
+	jbe ._done
+	# comment:fp stack unchanged by fallthrough
+# start:
+._start:
+	# r11_stack = r11
+	movq %r11,0(%rsp)
+	# r12_stack = r12
+	movq %r12,8(%rsp)
+	# r13_stack = r13
+	movq %r13,16(%rsp)
+	# r14_stack = r14
+	movq %r14,24(%rsp)
+	# r15_stack = r15
+	movq %r15,32(%rsp)
+	# rbx_stack = rbx
+	movq %rbx,40(%rsp)
+	# rbp_stack = rbp
+	movq %rbp,48(%rsp)
+	# in0 = *(uint64 *) (x + 0)
+	movq 0(%r8),%rcx
+	# in2 = *(uint64 *) (x + 8)
+	movq 8(%r8),%r9
+	# in4 = *(uint64 *) (x + 16)
+	movq 16(%r8),%rax
+	# in6 = *(uint64 *) (x + 24)
+	movq 24(%r8),%r10
+	# in8 = *(uint64 *) (x + 32)
+	movq 32(%r8),%r11
+	# in10 = *(uint64 *) (x + 40)
+	movq 40(%r8),%r12
+	# in12 = *(uint64 *) (x + 48)
+	movq 48(%r8),%r13
+	# in14 = *(uint64 *) (x + 56)
+	movq 56(%r8),%r14
+	# j0 = in0
+	movq %rcx,56(%rsp)
+	# j2 = in2
+	movq %r9,64(%rsp)
+	# j4 = in4
+	movq %rax,72(%rsp)
+	# j6 = in6
+	movq %r10,80(%rsp)
+	# j8 = in8
+	movq %r11,88(%rsp)
+	# j10 = in10
+	movq %r12,96(%rsp)
+	# j12 = in12
+	movq %r13,104(%rsp)
+	# j14 = in14
+	movq %r14,112(%rsp)
+	# x_backup = x
+	movq %r8,120(%rsp)
+# bytesatleast1:
+._bytesatleast1:
+	# unsigned<? bytes - 64
+	cmp $64,%rdx
+	# comment:fp stack unchanged by jump
+	# goto nocopy if !unsigned<
+	jae ._nocopy
+	# ctarget = out
+	movq %rdi,128(%rsp)
+	# out = &tmp
+	leaq 192(%rsp),%rdi
+	# i = bytes
+	mov %rdx,%rcx
+	# while (i) { *out++ = *m++; --i }
+	rep movsb
+	# out = &tmp
+	leaq 192(%rsp),%rdi
+	# m = &tmp
+	leaq 192(%rsp),%rsi
+	# comment:fp stack unchanged by fallthrough
+# nocopy:
+._nocopy:
+	# out_backup = out
+	movq %rdi,136(%rsp)
+	# m_backup = m
+	movq %rsi,144(%rsp)
+	# bytes_backup = bytes
+	movq %rdx,152(%rsp)
+	# x1 = j0
+	movq 56(%rsp),%rdi
+	# x0 = x1
+	mov %rdi,%rdx
+	# (uint64) x1 >>= 32
+	shr $32,%rdi
+	# x3 = j2
+	movq 64(%rsp),%rsi
+	# x2 = x3
+	mov %rsi,%rcx
+	# (uint64) x3 >>= 32
+	shr $32,%rsi
+	# x5 = j4
+	movq 72(%rsp),%r8
+	# x4 = x5
+	mov %r8,%r9
+	# (uint64) x5 >>= 32
+	shr $32,%r8
+	# x5_stack = x5
+	movq %r8,160(%rsp)
+	# x7 = j6
+	movq 80(%rsp),%r8
+	# x6 = x7
+	mov %r8,%rax
+	# (uint64) x7 >>= 32
+	shr $32,%r8
+	# x9 = j8
+	movq 88(%rsp),%r10
+	# x8 = x9
+	mov %r10,%r11
+	# (uint64) x9 >>= 32
+	shr $32,%r10
+	# x11 = j10
+	movq 96(%rsp),%r12
+	# x10 = x11
+	mov %r12,%r13
+	# x10_stack = x10
+	movq %r13,168(%rsp)
+	# (uint64) x11 >>= 32
+	shr $32,%r12
+	# x13 = j12
+	movq 104(%rsp),%r13
+	# x12 = x13
+	mov %r13,%r14
+	# (uint64) x13 >>= 32
+	shr $32,%r13
+	# x15 = j14
+	movq 112(%rsp),%r15
+	# x14 = x15
+	mov %r15,%rbx
+	# (uint64) x15 >>= 32
+	shr $32,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# i = 20
+	mov $20,%r15
+# mainloop:
+._mainloop:
+	# i_backup = i
+	movq %r15,184(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x12 + x0
+	lea (%r14,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x4 ^= a
+	xor %rbp,%r9
+	# b = x1 + x5
+	lea (%rdi,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x9 ^= b
+	xor %rbp,%r10
+	# a = x0 + x4
+	lea (%rdx,%r9),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x8 ^= a
+	xor %rbp,%r11
+	# b = x5 + x9
+	lea (%r15,%r10),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x13 ^= b
+	xor %rbp,%r13
+	# a = x4 + x8
+	lea (%r9,%r11),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x12 ^= a
+	xor %rbp,%r14
+	# b = x9 + x13
+	lea (%r10,%r13),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x1 ^= b
+	xor %rbp,%rdi
+	# a = x8 + x12
+	lea (%r11,%r14),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x13 + x1
+	lea (%r13,%rdi),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x6 + x10
+	lea (%rax,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x14 ^= c
+	xor %r15,%rbx
+	# c = x10 + x14
+	lea (%rbp,%rbx),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x2 ^= c
+	xor %r15,%rcx
+	# c = x14 + x2
+	lea (%rbx,%rcx),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x6 ^= c
+	xor %r15,%rax
+	# c = x2 + x6
+	lea (%rcx,%rax),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x11 + x15
+	lea (%r12,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x3 ^= d
+	xor %rbp,%rsi
+	# d = x15 + x3
+	lea (%r15,%rsi),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x7 ^= d
+	xor %rbp,%r8
+	# d = x3 + x7
+	lea (%rsi,%r8),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x11 ^= d
+	xor %rbp,%r12
+	# d = x7 + x11
+	lea (%r8,%r12),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x3 + x0
+	lea (%rsi,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x1 ^= a
+	xor %rbp,%rdi
+	# b = x4 + x5
+	lea (%r9,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x6 ^= b
+	xor %rbp,%rax
+	# a = x0 + x1
+	lea (%rdx,%rdi),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x2 ^= a
+	xor %rbp,%rcx
+	# b = x5 + x6
+	lea (%r15,%rax),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x7 ^= b
+	xor %rbp,%r8
+	# a = x1 + x2
+	lea (%rdi,%rcx),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x3 ^= a
+	xor %rbp,%rsi
+	# b = x6 + x7
+	lea (%rax,%r8),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x4 ^= b
+	xor %rbp,%r9
+	# a = x2 + x3
+	lea (%rcx,%rsi),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x7 + x4
+	lea (%r8,%r9),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x9 + x10
+	lea (%r10,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x11 ^= c
+	xor %r15,%r12
+	# c = x10 + x11
+	lea (%rbp,%r12),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x8 ^= c
+	xor %r15,%r11
+	# c = x11 + x8
+	lea (%r12,%r11),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x9 ^= c
+	xor %r15,%r10
+	# c = x8 + x9
+	lea (%r11,%r10),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x14 + x15
+	lea (%rbx,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x12 ^= d
+	xor %rbp,%r14
+	# d = x15 + x12
+	lea (%r15,%r14),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x13 ^= d
+	xor %rbp,%r13
+	# d = x12 + x13
+	lea (%r14,%r13),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x14 ^= d
+	xor %rbp,%rbx
+	# d = x13 + x14
+	lea (%r13,%rbx),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x12 + x0
+	lea (%r14,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x4 ^= a
+	xor %rbp,%r9
+	# b = x1 + x5
+	lea (%rdi,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x9 ^= b
+	xor %rbp,%r10
+	# a = x0 + x4
+	lea (%rdx,%r9),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x8 ^= a
+	xor %rbp,%r11
+	# b = x5 + x9
+	lea (%r15,%r10),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x13 ^= b
+	xor %rbp,%r13
+	# a = x4 + x8
+	lea (%r9,%r11),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x12 ^= a
+	xor %rbp,%r14
+	# b = x9 + x13
+	lea (%r10,%r13),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x1 ^= b
+	xor %rbp,%rdi
+	# a = x8 + x12
+	lea (%r11,%r14),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x13 + x1
+	lea (%r13,%rdi),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x6 + x10
+	lea (%rax,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x14 ^= c
+	xor %r15,%rbx
+	# c = x10 + x14
+	lea (%rbp,%rbx),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x2 ^= c
+	xor %r15,%rcx
+	# c = x14 + x2
+	lea (%rbx,%rcx),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x6 ^= c
+	xor %r15,%rax
+	# c = x2 + x6
+	lea (%rcx,%rax),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x11 + x15
+	lea (%r12,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x3 ^= d
+	xor %rbp,%rsi
+	# d = x15 + x3
+	lea (%r15,%rsi),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x7 ^= d
+	xor %rbp,%r8
+	# d = x3 + x7
+	lea (%rsi,%r8),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x11 ^= d
+	xor %rbp,%r12
+	# d = x7 + x11
+	lea (%r8,%r12),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# x5 = x5_stack
+	movq 160(%rsp),%r15
+	# a = x3 + x0
+	lea (%rsi,%rdx),%rbp
+	# (uint32) a <<<= 7
+	rol $7,%ebp
+	# x1 ^= a
+	xor %rbp,%rdi
+	# b = x4 + x5
+	lea (%r9,%r15),%rbp
+	# (uint32) b <<<= 7
+	rol $7,%ebp
+	# x6 ^= b
+	xor %rbp,%rax
+	# a = x0 + x1
+	lea (%rdx,%rdi),%rbp
+	# (uint32) a <<<= 9
+	rol $9,%ebp
+	# x2 ^= a
+	xor %rbp,%rcx
+	# b = x5 + x6
+	lea (%r15,%rax),%rbp
+	# (uint32) b <<<= 9
+	rol $9,%ebp
+	# x7 ^= b
+	xor %rbp,%r8
+	# a = x1 + x2
+	lea (%rdi,%rcx),%rbp
+	# (uint32) a <<<= 13
+	rol $13,%ebp
+	# x3 ^= a
+	xor %rbp,%rsi
+	# b = x6 + x7
+	lea (%rax,%r8),%rbp
+	# (uint32) b <<<= 13
+	rol $13,%ebp
+	# x4 ^= b
+	xor %rbp,%r9
+	# a = x2 + x3
+	lea (%rcx,%rsi),%rbp
+	# (uint32) a <<<= 18
+	rol $18,%ebp
+	# x0 ^= a
+	xor %rbp,%rdx
+	# b = x7 + x4
+	lea (%r8,%r9),%rbp
+	# (uint32) b <<<= 18
+	rol $18,%ebp
+	# x5 ^= b
+	xor %rbp,%r15
+	# x10 = x10_stack
+	movq 168(%rsp),%rbp
+	# x5_stack = x5
+	movq %r15,160(%rsp)
+	# c = x9 + x10
+	lea (%r10,%rbp),%r15
+	# (uint32) c <<<= 7
+	rol $7,%r15d
+	# x11 ^= c
+	xor %r15,%r12
+	# c = x10 + x11
+	lea (%rbp,%r12),%r15
+	# (uint32) c <<<= 9
+	rol $9,%r15d
+	# x8 ^= c
+	xor %r15,%r11
+	# c = x11 + x8
+	lea (%r12,%r11),%r15
+	# (uint32) c <<<= 13
+	rol $13,%r15d
+	# x9 ^= c
+	xor %r15,%r10
+	# c = x8 + x9
+	lea (%r11,%r10),%r15
+	# (uint32) c <<<= 18
+	rol $18,%r15d
+	# x10 ^= c
+	xor %r15,%rbp
+	# x15 = x15_stack
+	movq 176(%rsp),%r15
+	# x10_stack = x10
+	movq %rbp,168(%rsp)
+	# d = x14 + x15
+	lea (%rbx,%r15),%rbp
+	# (uint32) d <<<= 7
+	rol $7,%ebp
+	# x12 ^= d
+	xor %rbp,%r14
+	# d = x15 + x12
+	lea (%r15,%r14),%rbp
+	# (uint32) d <<<= 9
+	rol $9,%ebp
+	# x13 ^= d
+	xor %rbp,%r13
+	# d = x12 + x13
+	lea (%r14,%r13),%rbp
+	# (uint32) d <<<= 13
+	rol $13,%ebp
+	# x14 ^= d
+	xor %rbp,%rbx
+	# d = x13 + x14
+	lea (%r13,%rbx),%rbp
+	# (uint32) d <<<= 18
+	rol $18,%ebp
+	# x15 ^= d
+	xor %rbp,%r15
+	# x15_stack = x15
+	movq %r15,176(%rsp)
+	# i = i_backup
+	movq 184(%rsp),%r15
+	# unsigned>? i -= 4
+	sub $4,%r15
+	# comment:fp stack unchanged by jump
+	# goto mainloop if unsigned>
+	ja ._mainloop
+	# (uint32) x2 += j2
+	addl 64(%rsp),%ecx
+	# x3 <<= 32
+	shl $32,%rsi
+	# x3 += j2
+	addq 64(%rsp),%rsi
+	# (uint64) x3 >>= 32
+	shr $32,%rsi
+	# x3 <<= 32
+	shl $32,%rsi
+	# x2 += x3
+	add %rsi,%rcx
+	# (uint32) x6 += j6
+	addl 80(%rsp),%eax
+	# x7 <<= 32
+	shl $32,%r8
+	# x7 += j6
+	addq 80(%rsp),%r8
+	# (uint64) x7 >>= 32
+	shr $32,%r8
+	# x7 <<= 32
+	shl $32,%r8
+	# x6 += x7
+	add %r8,%rax
+	# (uint32) x8 += j8
+	addl 88(%rsp),%r11d
+	# x9 <<= 32
+	shl $32,%r10
+	# x9 += j8
+	addq 88(%rsp),%r10
+	# (uint64) x9 >>= 32
+	shr $32,%r10
+	# x9 <<= 32
+	shl $32,%r10
+	# x8 += x9
+	add %r10,%r11
+	# (uint32) x12 += j12
+	addl 104(%rsp),%r14d
+	# x13 <<= 32
+	shl $32,%r13
+	# x13 += j12
+	addq 104(%rsp),%r13
+	# (uint64) x13 >>= 32
+	shr $32,%r13
+	# x13 <<= 32
+	shl $32,%r13
+	# x12 += x13
+	add %r13,%r14
+	# (uint32) x0 += j0
+	addl 56(%rsp),%edx
+	# x1 <<= 32
+	shl $32,%rdi
+	# x1 += j0
+	addq 56(%rsp),%rdi
+	# (uint64) x1 >>= 32
+	shr $32,%rdi
+	# x1 <<= 32
+	shl $32,%rdi
+	# x0 += x1
+	add %rdi,%rdx
+	# x5 = x5_stack
+	movq 160(%rsp),%rdi
+	# (uint32) x4 += j4
+	addl 72(%rsp),%r9d
+	# x5 <<= 32
+	shl $32,%rdi
+	# x5 += j4
+	addq 72(%rsp),%rdi
+	# (uint64) x5 >>= 32
+	shr $32,%rdi
+	# x5 <<= 32
+	shl $32,%rdi
+	# x4 += x5
+	add %rdi,%r9
+	# x10 = x10_stack
+	movq 168(%rsp),%r8
+	# (uint32) x10 += j10
+	addl 96(%rsp),%r8d
+	# x11 <<= 32
+	shl $32,%r12
+	# x11 += j10
+	addq 96(%rsp),%r12
+	# (uint64) x11 >>= 32
+	shr $32,%r12
+	# x11 <<= 32
+	shl $32,%r12
+	# x10 += x11
+	add %r12,%r8
+	# x15 = x15_stack
+	movq 176(%rsp),%rdi
+	# (uint32) x14 += j14
+	addl 112(%rsp),%ebx
+	# x15 <<= 32
+	shl $32,%rdi
+	# x15 += j14
+	addq 112(%rsp),%rdi
+	# (uint64) x15 >>= 32
+	shr $32,%rdi
+	# x15 <<= 32
+	shl $32,%rdi
+	# x14 += x15
+	add %rdi,%rbx
+	# out = out_backup
+	movq 136(%rsp),%rdi
+	# m = m_backup
+	movq 144(%rsp),%rsi
+	# x0 ^= *(uint64 *) (m + 0)
+	xorq 0(%rsi),%rdx
+	# *(uint64 *) (out + 0) = x0
+	movq %rdx,0(%rdi)
+	# x2 ^= *(uint64 *) (m + 8)
+	xorq 8(%rsi),%rcx
+	# *(uint64 *) (out + 8) = x2
+	movq %rcx,8(%rdi)
+	# x4 ^= *(uint64 *) (m + 16)
+	xorq 16(%rsi),%r9
+	# *(uint64 *) (out + 16) = x4
+	movq %r9,16(%rdi)
+	# x6 ^= *(uint64 *) (m + 24)
+	xorq 24(%rsi),%rax
+	# *(uint64 *) (out + 24) = x6
+	movq %rax,24(%rdi)
+	# x8 ^= *(uint64 *) (m + 32)
+	xorq 32(%rsi),%r11
+	# *(uint64 *) (out + 32) = x8
+	movq %r11,32(%rdi)
+	# x10 ^= *(uint64 *) (m + 40)
+	xorq 40(%rsi),%r8
+	# *(uint64 *) (out + 40) = x10
+	movq %r8,40(%rdi)
+	# x12 ^= *(uint64 *) (m + 48)
+	xorq 48(%rsi),%r14
+	# *(uint64 *) (out + 48) = x12
+	movq %r14,48(%rdi)
+	# x14 ^= *(uint64 *) (m + 56)
+	xorq 56(%rsi),%rbx
+	# *(uint64 *) (out + 56) = x14
+	movq %rbx,56(%rdi)
+	# bytes = bytes_backup
+	movq 152(%rsp),%rdx
+	# in8 = j8
+	movq 88(%rsp),%rcx
+	# in8 += 1
+	add $1,%rcx
+	# j8 = in8
+	movq %rcx,88(%rsp)
+	# unsigned>? unsigned<? bytes - 64
+	cmp $64,%rdx
+	# comment:fp stack unchanged by jump
+	# goto bytesatleast65 if unsigned>
+	ja ._bytesatleast65
+	# comment:fp stack unchanged by jump
+	# goto bytesatleast64 if !unsigned<
+	jae ._bytesatleast64
+	# m = out
+	mov %rdi,%rsi
+	# out = ctarget
+	movq 128(%rsp),%rdi
+	# i = bytes
+	mov %rdx,%rcx
+	# while (i) { *out++ = *m++; --i }
+	rep movsb
+	# comment:fp stack unchanged by fallthrough
+# bytesatleast64:
+._bytesatleast64:
+	# x = x_backup
+	movq 120(%rsp),%rdi
+	# in8 = j8
+	movq 88(%rsp),%rsi
+	# *(uint64 *) (x + 32) = in8
+	movq %rsi,32(%rdi)
+	# r11 = r11_stack
+	movq 0(%rsp),%r11
+	# r12 = r12_stack
+	movq 8(%rsp),%r12
+	# r13 = r13_stack
+	movq 16(%rsp),%r13
+	# r14 = r14_stack
+	movq 24(%rsp),%r14
+	# r15 = r15_stack
+	movq 32(%rsp),%r15
+	# rbx = rbx_stack
+	movq 40(%rsp),%rbx
+	# rbp = rbp_stack
+	movq 48(%rsp),%rbp
+	# comment:fp stack unchanged by fallthrough
+# done:
+._done:
+	# leave
+	add %r11,%rsp
+	mov %rdi,%rax
+	mov %rsi,%rdx
+	ret
+# bytesatleast65:
+._bytesatleast65:
+	# bytes -= 64
+	sub $64,%rdx
+	# out += 64
+	add $64,%rdi
+	# m += 64
+	add $64,%rsi
+	# comment:fp stack unchanged by jump
+	# goto bytesatleast1
+	jmp ._bytesatleast1
+# enter ECRYPT_keysetup
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+	mov %rsp,%r11
+	and $31,%r11
+	add $256,%r11
+	sub %r11,%rsp
+	# k = arg2
+	mov %rsi,%rsi
+	# kbits = arg3
+	mov %rdx,%rdx
+	# x = arg1
+	mov %rdi,%rdi
+	# in0 = *(uint64 *) (k + 0)
+	movq 0(%rsi),%r8
+	# in2 = *(uint64 *) (k + 8)
+	movq 8(%rsi),%r9
+	# *(uint64 *) (x + 4) = in0
+	movq %r8,4(%rdi)
+	# *(uint64 *) (x + 12) = in2
+	movq %r9,12(%rdi)
+	# unsigned<? kbits - 256
+	cmp $256,%rdx
+	# comment:fp stack unchanged by jump
+	# goto kbits128 if unsigned<
+	jb ._kbits128
+# kbits256:
+._kbits256:
+	# in10 = *(uint64 *) (k + 16)
+	movq 16(%rsi),%rdx
+	# in12 = *(uint64 *) (k + 24)
+	movq 24(%rsi),%rsi
+	# *(uint64 *) (x + 44) = in10
+	movq %rdx,44(%rdi)
+	# *(uint64 *) (x + 52) = in12
+	movq %rsi,52(%rdi)
+	# in0 = 1634760805
+	mov $1634760805,%rsi
+	# in4 = 857760878
+	mov $857760878,%rdx
+	# in10 = 2036477234
+	mov $2036477234,%rcx
+	# in14 = 1797285236
+	mov $1797285236,%r8
+	# *(uint32 *) (x + 0) = in0
+	movl %esi,0(%rdi)
+	# *(uint32 *) (x + 20) = in4
+	movl %edx,20(%rdi)
+	# *(uint32 *) (x + 40) = in10
+	movl %ecx,40(%rdi)
+	# *(uint32 *) (x + 60) = in14
+	movl %r8d,60(%rdi)
+	# comment:fp stack unchanged by jump
+	# goto keysetupdone
+	jmp ._keysetupdone
+# kbits128:
+._kbits128:
+	# in10 = *(uint64 *) (k + 0)
+	movq 0(%rsi),%rdx
+	# in12 = *(uint64 *) (k + 8)
+	movq 8(%rsi),%rsi
+	# *(uint64 *) (x + 44) = in10
+	movq %rdx,44(%rdi)
+	# *(uint64 *) (x + 52) = in12
+	movq %rsi,52(%rdi)
+	# in0 = 1634760805
+	mov $1634760805,%rsi
+	# in4 = 824206446
+	mov $824206446,%rdx
+	# in10 = 2036477238
+	mov $2036477238,%rcx
+	# in14 = 1797285236
+	mov $1797285236,%r8
+	# *(uint32 *) (x + 0) = in0
+	movl %esi,0(%rdi)
+	# *(uint32 *) (x + 20) = in4
+	movl %edx,20(%rdi)
+	# *(uint32 *) (x + 40) = in10
+	movl %ecx,40(%rdi)
+	# *(uint32 *) (x + 60) = in14
+	movl %r8d,60(%rdi)
+# keysetupdone:
+._keysetupdone:
+	# leave
+	add %r11,%rsp
+	mov %rdi,%rax
+	mov %rsi,%rdx
+	ret
+# enter ECRYPT_ivsetup
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+	mov %rsp,%r11
+	and $31,%r11
+	add $256,%r11
+	sub %r11,%rsp
+	# iv = arg2
+	mov %rsi,%rsi
+	# x = arg1
+	mov %rdi,%rdi
+	# in6 = *(uint64 *) (iv + 0)
+	movq 0(%rsi),%rsi
+	# in8 = 0
+	mov $0,%r8
+	# *(uint64 *) (x + 24) = in6
+	movq %rsi,24(%rdi)
+	# *(uint64 *) (x + 32) = in8
+	movq %r8,32(%rdi)
+	# leave
+	add %r11,%rsp
+	mov %rdi,%rax
+	mov %rsi,%rdx
+	ret
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
index 3be443995ed6..bccb76d80987 100644
--- a/arch/x86/crypto/salsa20_glue.c
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -8,6 +8,8 @@
  * and to remove extraneous comments and functions that are not needed.
  * - i586 version, renamed as salsa20-i586-asm_32.S
  *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
+ * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
+ *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free