11 files changed, 2309 insertions, 1036 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 46bb609e2444..3874c2de5403 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -4,12 +4,16 @@
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
 obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
+obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
 obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
+obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
-aes-i586-y := aes-i586-asm_32.o aes_32.o
+aes-i586-y := aes-i586-asm_32.o aes_glue.o
-twofish-i586-y := twofish-i586-asm_32.o twofish_32.o
+twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
+salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
-aes-x86_64-y := aes-x86_64-asm_64.o aes_64.o
+aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
-twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_64.o
+twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
+salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
index f942f0c8f630..1093bede3e0a 100644
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -46,9 +46,9 @@
 #define in_blk 16
 /* offsets in crypto_tfm structure */
-#define ekey (crypto_tfm_ctx_offset + 0)
+#define klen (crypto_tfm_ctx_offset + 0)
-#define nrnd (crypto_tfm_ctx_offset + 256)
+#define ekey (crypto_tfm_ctx_offset + 4)
-#define dkey (crypto_tfm_ctx_offset + 260)
+#define dkey (crypto_tfm_ctx_offset + 244)
 // register mapping for encrypt and decrypt subroutines
@@ -221,8 +221,8 @@
 .global  aes_enc_blk
-.extern  ft_tab
+.extern  crypto_ft_tab
-.extern  fl_tab
+.extern  crypto_fl_tab
 .align 4
@@ -236,7 +236,7 @@ aes_enc_blk:
 1:      push    %ebx
        mov     in_blk+4(%esp),%r2
        push    %esi
-        mov     nrnd(%ebp),%r3   // number of rounds
+        mov     klen(%ebp),%r3   // key length in bytes (16, 24 or 32)
        push    %edi
 #if ekey != 0
        lea     ekey(%ebp),%ebp  // key pointer
@@ -255,26 +255,26 @@ aes_enc_blk:
        sub     $8,%esp         // space for register saves on stack
        add     $16,%ebp        // increment to next round key
-        cmp     $12,%r3
+        cmp     $24,%r3
        jb      4f              // 10 rounds for 128-bit key
        lea     32(%ebp),%ebp
        je      3f              // 12 rounds for 192-bit key
        lea     32(%ebp),%ebp
-2:      fwd_rnd1( -64(%ebp) ,ft_tab)    // 14 rounds for 256-bit key
+2:      fwd_rnd1( -64(%ebp), crypto_ft_tab)     // 14 rounds for 256-bit key
-        fwd_rnd2( -48(%ebp) ,ft_tab)
+        fwd_rnd2( -48(%ebp), crypto_ft_tab)
-3:      fwd_rnd1( -32(%ebp) ,ft_tab)    // 12 rounds for 192-bit key
+3:      fwd_rnd1( -32(%ebp), crypto_ft_tab)     // 12 rounds for 192-bit key
-        fwd_rnd2( -16(%ebp) ,ft_tab)
+        fwd_rnd2( -16(%ebp), crypto_ft_tab)
-4:      fwd_rnd1(    (%ebp) ,ft_tab)    // 10 rounds for 128-bit key
+4:      fwd_rnd1(    (%ebp), crypto_ft_tab)     // 10 rounds for 128-bit key
-        fwd_rnd2( +16(%ebp) ,ft_tab)
+        fwd_rnd2( +16(%ebp), crypto_ft_tab)
-        fwd_rnd1( +32(%ebp) ,ft_tab)
+        fwd_rnd1( +32(%ebp), crypto_ft_tab)
-        fwd_rnd2( +48(%ebp) ,ft_tab)
+        fwd_rnd2( +48(%ebp), crypto_ft_tab)
-        fwd_rnd1( +64(%ebp) ,ft_tab)
+        fwd_rnd1( +64(%ebp), crypto_ft_tab)
-        fwd_rnd2( +80(%ebp) ,ft_tab)
+        fwd_rnd2( +80(%ebp), crypto_ft_tab)
-        fwd_rnd1( +96(%ebp) ,ft_tab)
+        fwd_rnd1( +96(%ebp), crypto_ft_tab)
-        fwd_rnd2(+112(%ebp) ,ft_tab)
+        fwd_rnd2(+112(%ebp), crypto_ft_tab)
-        fwd_rnd1(+128(%ebp) ,ft_tab)
+        fwd_rnd1(+128(%ebp), crypto_ft_tab)
-        fwd_rnd2(+144(%ebp) ,fl_tab)    // last round uses a different table
+        fwd_rnd2(+144(%ebp), crypto_fl_tab)     // last round uses a different table
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
@@ -297,8 +297,8 @@ aes_enc_blk:
 .global  aes_dec_blk
-.extern  it_tab
+.extern  crypto_it_tab
-.extern  il_tab
+.extern  crypto_il_tab
 .align 4
@@ -312,14 +312,11 @@ aes_dec_blk:
 1:      push    %ebx
        mov     in_blk+4(%esp),%r2
        push    %esi
-        mov     nrnd(%ebp),%r3   // number of rounds
+        mov     klen(%ebp),%r3   // key length in bytes (16, 24 or 32)
        push    %edi
 #if dkey != 0
        lea     dkey(%ebp),%ebp  // key pointer
 #endif
-        mov     %r3,%r0
-        shl     $4,%r0
-        add     %r0,%ebp
        
 // input four columns and xor in first round key
@@ -333,27 +330,27 @@ aes_dec_blk:
        xor     12(%ebp),%r5
        sub     $8,%esp         // space for register saves on stack
-        sub     $16,%ebp        // increment to next round key
+        add     $16,%ebp        // increment to next round key
-        cmp     $12,%r3
+        cmp     $24,%r3
        jb      4f              // 10 rounds for 128-bit key
-        lea     -32(%ebp),%ebp
+        lea     32(%ebp),%ebp
        je      3f              // 12 rounds for 192-bit key
-        lea     -32(%ebp),%ebp
+        lea     32(%ebp),%ebp
-2:      inv_rnd1( +64(%ebp), it_tab)    // 14 rounds for 256-bit key
+2:      inv_rnd1( -64(%ebp), crypto_it_tab)     // 14 rounds for 256-bit key
-        inv_rnd2( +48(%ebp), it_tab)
+        inv_rnd2( -48(%ebp), crypto_it_tab)
-3:      inv_rnd1( +32(%ebp), it_tab)    // 12 rounds for 192-bit key
+3:      inv_rnd1( -32(%ebp), crypto_it_tab)     // 12 rounds for 192-bit key
-        inv_rnd2( +16(%ebp), it_tab)
+        inv_rnd2( -16(%ebp), crypto_it_tab)
-4:      inv_rnd1(    (%ebp), it_tab)    // 10 rounds for 128-bit key
+4:      inv_rnd1(    (%ebp), crypto_it_tab)     // 10 rounds for 128-bit key
-        inv_rnd2( -16(%ebp), it_tab)
+        inv_rnd2( +16(%ebp), crypto_it_tab)
-        inv_rnd1( -32(%ebp), it_tab)
+        inv_rnd1( +32(%ebp), crypto_it_tab)
-        inv_rnd2( -48(%ebp), it_tab)
+        inv_rnd2( +48(%ebp), crypto_it_tab)
-        inv_rnd1( -64(%ebp), it_tab)
+        inv_rnd1( +64(%ebp), crypto_it_tab)
-        inv_rnd2( -80(%ebp), it_tab)
+        inv_rnd2( +80(%ebp), crypto_it_tab)
-        inv_rnd1( -96(%ebp), it_tab)
+        inv_rnd1( +96(%ebp), crypto_it_tab)
-        inv_rnd2(-112(%ebp), it_tab)
+        inv_rnd2(+112(%ebp), crypto_it_tab)
-        inv_rnd1(-128(%ebp), it_tab)
+        inv_rnd1(+128(%ebp), crypto_it_tab)
-        inv_rnd2(-144(%ebp), il_tab)    // last round uses a different table
+        inv_rnd2(+144(%ebp), crypto_il_tab)     // last round uses a different table
 // move final values to the output array.  CAUTION: the 
 // order of these assigns rely on the register mappings
diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S
index 26b40de4d0b0..a120f526c3df 100644
--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -8,10 +8,10 @@
 * including this sentence is retained in full.
 */
-.extern aes_ft_tab
+.extern crypto_ft_tab
-.extern aes_it_tab
+.extern crypto_it_tab
-.extern aes_fl_tab
+.extern crypto_fl_tab
-.extern aes_il_tab
+.extern crypto_il_tab
 .text
@@ -56,13 +56,13 @@
        .align  8;                      \
 FUNC:   movq    r1,r2;                  \
        movq    r3,r4;                  \
-        leaq    BASE+KEY+52(r8),r9;     \
+        leaq    BASE+KEY+48+4(r8),r9;   \
        movq    r10,r11;                \
        movl    (r7),r5 ## E;           \
        movl    4(r7),r1 ## E;          \
        movl    8(r7),r6 ## E;          \
        movl    12(r7),r7 ## E;         \
-        movl    BASE(r8),r10 ## E;      \
+        movl    BASE+0(r8),r10 ## E;    \
        xorl    -48(r9),r5 ## E;        \
        xorl    -44(r9),r1 ## E;        \
        xorl    -40(r9),r6 ## E;        \
@@ -154,37 +154,37 @@ FUNC:	movq	r1,r2;			\
 /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
        entry(aes_enc_blk,0,enc128,enc192)
-        encrypt_round(aes_ft_tab,-96)
+        encrypt_round(crypto_ft_tab,-96)
-        encrypt_round(aes_ft_tab,-80)
+        encrypt_round(crypto_ft_tab,-80)
-enc192: encrypt_round(aes_ft_tab,-64)
+enc192: encrypt_round(crypto_ft_tab,-64)
-        encrypt_round(aes_ft_tab,-48)
+        encrypt_round(crypto_ft_tab,-48)
-enc128: encrypt_round(aes_ft_tab,-32)
+enc128: encrypt_round(crypto_ft_tab,-32)
-        encrypt_round(aes_ft_tab,-16)
+        encrypt_round(crypto_ft_tab,-16)
-        encrypt_round(aes_ft_tab,  0)
+        encrypt_round(crypto_ft_tab,  0)
-        encrypt_round(aes_ft_tab, 16)
+        encrypt_round(crypto_ft_tab, 16)
-        encrypt_round(aes_ft_tab, 32)
+        encrypt_round(crypto_ft_tab, 32)
-        encrypt_round(aes_ft_tab, 48)
+        encrypt_round(crypto_ft_tab, 48)
-        encrypt_round(aes_ft_tab, 64)
+        encrypt_round(crypto_ft_tab, 64)
-        encrypt_round(aes_ft_tab, 80)
+        encrypt_round(crypto_ft_tab, 80)
-        encrypt_round(aes_ft_tab, 96)
+        encrypt_round(crypto_ft_tab, 96)
-        encrypt_final(aes_fl_tab,112)
+        encrypt_final(crypto_fl_tab,112)
        return
 /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */
        entry(aes_dec_blk,240,dec128,dec192)
-        decrypt_round(aes_it_tab,-96)
+        decrypt_round(crypto_it_tab,-96)
-        decrypt_round(aes_it_tab,-80)
+        decrypt_round(crypto_it_tab,-80)
-dec192: decrypt_round(aes_it_tab,-64)
+dec192: decrypt_round(crypto_it_tab,-64)
-        decrypt_round(aes_it_tab,-48)
+        decrypt_round(crypto_it_tab,-48)
-dec128: decrypt_round(aes_it_tab,-32)
+dec128: decrypt_round(crypto_it_tab,-32)
-        decrypt_round(aes_it_tab,-16)
+        decrypt_round(crypto_it_tab,-16)
-        decrypt_round(aes_it_tab,  0)
+        decrypt_round(crypto_it_tab,  0)
-        decrypt_round(aes_it_tab, 16)
+        decrypt_round(crypto_it_tab, 16)
-        decrypt_round(aes_it_tab, 32)
+        decrypt_round(crypto_it_tab, 32)
-        decrypt_round(aes_it_tab, 48)
+        decrypt_round(crypto_it_tab, 48)
-        decrypt_round(aes_it_tab, 64)
+        decrypt_round(crypto_it_tab, 64)
-        decrypt_round(aes_it_tab, 80)
+        decrypt_round(crypto_it_tab, 80)
-        decrypt_round(aes_it_tab, 96)
+        decrypt_round(crypto_it_tab, 96)
-        decrypt_final(aes_il_tab,112)
+        decrypt_final(crypto_il_tab,112)
        return
diff --git a/arch/x86/crypto/aes_32.c b/arch/x86/crypto/aes_32.c
deleted file mode 100644
index 49aad9397f10..000000000000
--- a/arch/x86/crypto/aes_32.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/* 
- * 
- * Glue Code for optimized 586 assembler version of AES
- *
- * Copyright (c) 2002, Dr Brian Gladman <>, Worcester, UK.
- * All rights reserved.
- *
- * LICENSE TERMS
- *
- * The free distribution and use of this software in both source and binary
- * form is allowed (with or without changes) provided that:
- *
- *   1. distributions of this source code include the above copyright
- *      notice, this list of conditions and the following disclaimer;
- *
- *   2. distributions in binary form include the above copyright
- *      notice, this list of conditions and the following disclaimer
- *      in the documentation and/or other associated materials;
- *
- *   3. the copyright holder's name is not used to endorse products
- *      built using this software without specific written permission.
- *
- * ALTERNATIVELY, provided that this notice is retained in full, this product
- * may be distributed under the terms of the GNU General Public License (GPL),
- * in which case the provisions of the GPL apply INSTEAD OF those given above.
- *
- * DISCLAIMER
- *
- * This software is provided 'as is' with no explicit or implied warranties
- * in respect of its properties, including, but not limited to, correctness
- * and/or fitness for purpose.
- *
- * Copyright (c) 2003, Adam J. Richter <adam@yggdrasil.com> (conversion to
- * 2.5 API).
- * Copyright (c) 2003, 2004 Fruhwirth Clemens <clemens@endorphin.org>
- * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
- *
- */
-#include <asm/byteorder.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/crypto.h>
-#include <linux/linkage.h>
-asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-#define AES_MIN_KEY_SIZE        16
-#define AES_MAX_KEY_SIZE        32
-#define AES_BLOCK_SIZE          16
-#define AES_KS_LENGTH           4 * AES_BLOCK_SIZE
-#define RC_LENGTH               29
-struct aes_ctx {
-        u32 ekey[AES_KS_LENGTH];
-        u32 rounds;
-        u32 dkey[AES_KS_LENGTH];
-};
-#define WPOLY 0x011b
-#define bytes2word(b0, b1, b2, b3)  \
-        (((u32)(b3) << 24) | ((u32)(b2) << 16) | ((u32)(b1) << 8) | (b0))
-/* define the finite field multiplies required for Rijndael */
-#define f2(x) ((x) ? pow[log[x] + 0x19] : 0)
-#define f3(x) ((x) ? pow[log[x] + 0x01] : 0)
-#define f9(x) ((x) ? pow[log[x] + 0xc7] : 0)
-#define fb(x) ((x) ? pow[log[x] + 0x68] : 0)
-#define fd(x) ((x) ? pow[log[x] + 0xee] : 0)
-#define fe(x) ((x) ? pow[log[x] + 0xdf] : 0)
-#define fi(x) ((x) ?   pow[255 - log[x]]: 0)
-static inline u32 upr(u32 x, int n)
-{
-        return (x << 8 * n) | (x >> (32 - 8 * n));
-}
-static inline u8 bval(u32 x, int n)
-{
-        return x >> 8 * n;
-}
-/* The forward and inverse affine transformations used in the S-box */
-#define fwd_affine(x) \
-        (w = (u32)x, w ^= (w<<1)^(w<<2)^(w<<3)^(w<<4), 0x63^(u8)(w^(w>>8)))
-#define inv_affine(x) \
-        (w = (u32)x, w = (w<<1)^(w<<3)^(w<<6), 0x05^(u8)(w^(w>>8)))
-static u32 rcon_tab[RC_LENGTH];
-u32 ft_tab[4][256];
-u32 fl_tab[4][256];
-static u32 im_tab[4][256];
-u32 il_tab[4][256];
-u32 it_tab[4][256];
-static void gen_tabs(void)
-{
-        u32 i, w;
-        u8 pow[512], log[256];
-        /*
-         * log and power tables for GF(2^8) finite field with
-         * WPOLY as modular polynomial - the simplest primitive
-         * root is 0x03, used here to generate the tables.
-         */
-        i = 0; w = 1; 
-        
-        do {
-                pow[i] = (u8)w;
-                pow[i + 255] = (u8)w;
-                log[w] = (u8)i++;
-                w ^=  (w << 1) ^ (w & 0x80 ? WPOLY : 0);
-        } while (w != 1);
-        
-        for(i = 0, w = 1; i < RC_LENGTH; ++i) {
-                rcon_tab[i] = bytes2word(w, 0, 0, 0);
-                w = f2(w);
-        }
-        for(i = 0; i < 256; ++i) {
-                u8 b;
-                
-                b = fwd_affine(fi((u8)i));
-                w = bytes2word(f2(b), b, b, f3(b));
-                /* tables for a normal encryption round */
-                ft_tab[0][i] = w;
-                ft_tab[1][i] = upr(w, 1);
-                ft_tab[2][i] = upr(w, 2);
-                ft_tab[3][i] = upr(w, 3);
-                w = bytes2word(b, 0, 0, 0);
-                
-                /*
-                 * tables for last encryption round
-                 * (may also be used in the key schedule)
-                 */
-                fl_tab[0][i] = w;
-                fl_tab[1][i] = upr(w, 1);
-                fl_tab[2][i] = upr(w, 2);
-                fl_tab[3][i] = upr(w, 3);
-                
-                b = fi(inv_affine((u8)i));
-                w = bytes2word(fe(b), f9(b), fd(b), fb(b));
-                /* tables for the inverse mix column operation  */
-                im_tab[0][b] = w;
-                im_tab[1][b] = upr(w, 1);
-                im_tab[2][b] = upr(w, 2);
-                im_tab[3][b] = upr(w, 3);
-                /* tables for a normal decryption round */
-                it_tab[0][i] = w;
-                it_tab[1][i] = upr(w,1);
-                it_tab[2][i] = upr(w,2);
-                it_tab[3][i] = upr(w,3);
-                w = bytes2word(b, 0, 0, 0);
-                
-                /* tables for last decryption round */
-                il_tab[0][i] = w;
-                il_tab[1][i] = upr(w,1);
-                il_tab[2][i] = upr(w,2);
-                il_tab[3][i] = upr(w,3);
-    }
-}
-#define four_tables(x,tab,vf,rf,c)              \
-(       tab[0][bval(vf(x,0,c),rf(0,c))] ^       \
-        tab[1][bval(vf(x,1,c),rf(1,c))] ^       \
-        tab[2][bval(vf(x,2,c),rf(2,c))] ^       \
-        tab[3][bval(vf(x,3,c),rf(3,c))]         \
-)
-#define vf1(x,r,c)  (x)
-#define rf1(r,c)    (r)
-#define rf2(r,c)    ((r-c)&3)
-#define inv_mcol(x) four_tables(x,im_tab,vf1,rf1,0)
-#define ls_box(x,c) four_tables(x,fl_tab,vf1,rf2,c)
-#define ff(x) inv_mcol(x)
-#define ke4(k,i)                                                        \
-{                                                                       \
-        k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i];            \
-        k[4*(i)+5] = ss[1] ^= ss[0];                                    \
-        k[4*(i)+6] = ss[2] ^= ss[1];                                    \
-        k[4*(i)+7] = ss[3] ^= ss[2];                                    \
-}
-#define kel4(k,i)                                                       \
-{                                                                       \
-        k[4*(i)+4] = ss[0] ^= ls_box(ss[3],3) ^ rcon_tab[i];            \
-        k[4*(i)+5] = ss[1] ^= ss[0];                                    \
-        k[4*(i)+6] = ss[2] ^= ss[1]; k[4*(i)+7] = ss[3] ^= ss[2];       \
-}
-#define ke6(k,i)                                                        \
-{                                                                       \
-        k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];           \
-        k[6*(i)+ 7] = ss[1] ^= ss[0];                                   \
-        k[6*(i)+ 8] = ss[2] ^= ss[1];                                   \
-        k[6*(i)+ 9] = ss[3] ^= ss[2];                                   \
-        k[6*(i)+10] = ss[4] ^= ss[3];                                   \
-        k[6*(i)+11] = ss[5] ^= ss[4];                                   \
-}
-#define kel6(k,i)                                                       \
-{                                                                       \
-        k[6*(i)+ 6] = ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];           \
-        k[6*(i)+ 7] = ss[1] ^= ss[0];                                   \
-        k[6*(i)+ 8] = ss[2] ^= ss[1];                                   \
-        k[6*(i)+ 9] = ss[3] ^= ss[2];                                   \
-}
-#define ke8(k,i)                                                        \
-{                                                                       \
-        k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];           \
-        k[8*(i)+ 9] = ss[1] ^= ss[0];                                   \
-        k[8*(i)+10] = ss[2] ^= ss[1];                                   \
-        k[8*(i)+11] = ss[3] ^= ss[2];                                   \
-        k[8*(i)+12] = ss[4] ^= ls_box(ss[3],0);                         \
-        k[8*(i)+13] = ss[5] ^= ss[4];                                   \
-        k[8*(i)+14] = ss[6] ^= ss[5];                                   \
-        k[8*(i)+15] = ss[7] ^= ss[6];                                   \
-}
-#define kel8(k,i)                                                       \
-{                                                                       \
-        k[8*(i)+ 8] = ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];           \
-        k[8*(i)+ 9] = ss[1] ^= ss[0];                                   \
-        k[8*(i)+10] = ss[2] ^= ss[1];                                   \
-        k[8*(i)+11] = ss[3] ^= ss[2];                                   \
-}
-#define kdf4(k,i)                                                       \
-{                                                                       \
-        ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3];                          \
-        ss[1] = ss[1] ^ ss[3];                                          \
-        ss[2] = ss[2] ^ ss[3];                                          \
-        ss[3] = ss[3];                                                  \
-        ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];                 \
-        ss[i % 4] ^= ss[4];                                             \
-        ss[4] ^= k[4*(i)];                                              \
-        k[4*(i)+4] = ff(ss[4]);                                         \
-        ss[4] ^= k[4*(i)+1];                                            \
-        k[4*(i)+5] = ff(ss[4]);                                         \
-        ss[4] ^= k[4*(i)+2];                                            \
-        k[4*(i)+6] = ff(ss[4]);                                         \
-        ss[4] ^= k[4*(i)+3];                                            \
-        k[4*(i)+7] = ff(ss[4]);                                         \
-}
-#define kd4(k,i)                                                        \
-{                                                                       \
-        ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];                 \
-        ss[i % 4] ^= ss[4];                                             \
-        ss[4] = ff(ss[4]);                                              \
-        k[4*(i)+4] = ss[4] ^= k[4*(i)];                                 \
-        k[4*(i)+5] = ss[4] ^= k[4*(i)+1];                               \
-        k[4*(i)+6] = ss[4] ^= k[4*(i)+2];                               \
-        k[4*(i)+7] = ss[4] ^= k[4*(i)+3];                               \
-}
-#define kdl4(k,i)                                                       \
-{                                                                       \
-        ss[4] = ls_box(ss[(i+3) % 4], 3) ^ rcon_tab[i];                 \
-        ss[i % 4] ^= ss[4];                                             \
-        k[4*(i)+4] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3];                  \
-        k[4*(i)+5] = ss[1] ^ ss[3];                                     \
-        k[4*(i)+6] = ss[0];                                             \
-        k[4*(i)+7] = ss[1];                                             \
-}
-#define kdf6(k,i)                                                       \
-{                                                                       \
-        ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];                         \
-        k[6*(i)+ 6] = ff(ss[0]);                                        \
-        ss[1] ^= ss[0];                                                 \
-        k[6*(i)+ 7] = ff(ss[1]);                                        \
-        ss[2] ^= ss[1];                                                 \
-        k[6*(i)+ 8] = ff(ss[2]);                                        \
-        ss[3] ^= ss[2];                                                 \
-        k[6*(i)+ 9] = ff(ss[3]);                                        \
-        ss[4] ^= ss[3];                                                 \
-        k[6*(i)+10] = ff(ss[4]);                                        \
-        ss[5] ^= ss[4];                                                 \
-        k[6*(i)+11] = ff(ss[5]);                                        \
-}
-#define kd6(k,i)                                                        \
-{                                                                       \
-        ss[6] = ls_box(ss[5],3) ^ rcon_tab[i];                          \
-        ss[0] ^= ss[6]; ss[6] = ff(ss[6]);                              \
-        k[6*(i)+ 6] = ss[6] ^= k[6*(i)];                                \
-        ss[1] ^= ss[0];                                                 \
-        k[6*(i)+ 7] = ss[6] ^= k[6*(i)+ 1];                             \
-        ss[2] ^= ss[1];                                                 \
-        k[6*(i)+ 8] = ss[6] ^= k[6*(i)+ 2];                             \
-        ss[3] ^= ss[2];                                                 \
-        k[6*(i)+ 9] = ss[6] ^= k[6*(i)+ 3];                             \
-        ss[4] ^= ss[3];                                                 \
-        k[6*(i)+10] = ss[6] ^= k[6*(i)+ 4];                             \
-        ss[5] ^= ss[4];                                                 \
-        k[6*(i)+11] = ss[6] ^= k[6*(i)+ 5];                             \
-}
-#define kdl6(k,i)                                                       \
-{                                                                       \
-        ss[0] ^= ls_box(ss[5],3) ^ rcon_tab[i];                         \
-        k[6*(i)+ 6] = ss[0];                                            \
-        ss[1] ^= ss[0];                                                 \
-        k[6*(i)+ 7] = ss[1];                                            \
-        ss[2] ^= ss[1];                                                 \
-        k[6*(i)+ 8] = ss[2];                                            \
-        ss[3] ^= ss[2];                                                 \
-        k[6*(i)+ 9] = ss[3];                                            \
-}
-#define kdf8(k,i)                                                       \
-{                                                                       \
-        ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];                         \
-        k[8*(i)+ 8] = ff(ss[0]);                                        \
-        ss[1] ^= ss[0];                                                 \
-        k[8*(i)+ 9] = ff(ss[1]);                                        \
-        ss[2] ^= ss[1];                                                 \
-        k[8*(i)+10] = ff(ss[2]);                                        \
-        ss[3] ^= ss[2];                                                 \
-        k[8*(i)+11] = ff(ss[3]);                                        \
-        ss[4] ^= ls_box(ss[3],0);                                       \
-        k[8*(i)+12] = ff(ss[4]);                                        \
-        ss[5] ^= ss[4];                                                 \
-        k[8*(i)+13] = ff(ss[5]);                                        \
-        ss[6] ^= ss[5];                                                 \
-        k[8*(i)+14] = ff(ss[6]);                                        \
-        ss[7] ^= ss[6];                                                 \
-        k[8*(i)+15] = ff(ss[7]);                                        \
-}
-#define kd8(k,i)                                                        \
-{                                                                       \
-        u32 __g = ls_box(ss[7],3) ^ rcon_tab[i];                        \
-        ss[0] ^= __g;                                                   \
-        __g = ff(__g);                                                  \
-        k[8*(i)+ 8] = __g ^= k[8*(i)];                                  \
-        ss[1] ^= ss[0];                                                 \
-        k[8*(i)+ 9] = __g ^= k[8*(i)+ 1];                               \
-        ss[2] ^= ss[1];                                                 \
-        k[8*(i)+10] = __g ^= k[8*(i)+ 2];                               \
-        ss[3] ^= ss[2];                                                 \
-        k[8*(i)+11] = __g ^= k[8*(i)+ 3];                               \
-        __g = ls_box(ss[3],0);                                          \
-        ss[4] ^= __g;                                                   \
-        __g = ff(__g);                                                  \
-        k[8*(i)+12] = __g ^= k[8*(i)+ 4];                               \
-        ss[5] ^= ss[4];                                                 \
-        k[8*(i)+13] = __g ^= k[8*(i)+ 5];                               \
-        ss[6] ^= ss[5];                                                 \
-        k[8*(i)+14] = __g ^= k[8*(i)+ 6];                               \
-        ss[7] ^= ss[6];                                                 \
-        k[8*(i)+15] = __g ^= k[8*(i)+ 7];                               \
-}
-#define kdl8(k,i)                                                       \
-{                                                                       \
-        ss[0] ^= ls_box(ss[7],3) ^ rcon_tab[i];                         \
-        k[8*(i)+ 8] = ss[0];                                            \
-        ss[1] ^= ss[0];                                                 \
-        k[8*(i)+ 9] = ss[1];                                            \
-        ss[2] ^= ss[1];                                                 \
-        k[8*(i)+10] = ss[2];                                            \
-        ss[3] ^= ss[2];                                                 \
-        k[8*(i)+11] = ss[3];                                            \
-}
-static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
-                       unsigned int key_len)
-{
-        int i;
-        u32 ss[8];
-        struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
-        const __le32 *key = (const __le32 *)in_key;
-        u32 *flags = &tfm->crt_flags;
-        /* encryption schedule */
-        
-        ctx->ekey[0] = ss[0] = le32_to_cpu(key[0]);
-        ctx->ekey[1] = ss[1] = le32_to_cpu(key[1]);
-        ctx->ekey[2] = ss[2] = le32_to_cpu(key[2]);
-        ctx->ekey[3] = ss[3] = le32_to_cpu(key[3]);
-        switch(key_len) {
-        case 16:
-                for (i = 0; i < 9; i++)
-                        ke4(ctx->ekey, i);
-                kel4(ctx->ekey, 9);
-                ctx->rounds = 10;
-                break;
-                
-        case 24:
-                ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
-                ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
-                for (i = 0; i < 7; i++)
-                        ke6(ctx->ekey, i);
-                kel6(ctx->ekey, 7); 
-                ctx->rounds = 12;
-                break;
-        case 32:
-                ctx->ekey[4] = ss[4] = le32_to_cpu(key[4]);
-                ctx->ekey[5] = ss[5] = le32_to_cpu(key[5]);
-                ctx->ekey[6] = ss[6] = le32_to_cpu(key[6]);
-                ctx->ekey[7] = ss[7] = le32_to_cpu(key[7]);
-                for (i = 0; i < 6; i++)
-                        ke8(ctx->ekey, i);
-                kel8(ctx->ekey, 6);
-                ctx->rounds = 14;
-                break;
-        default:
-                *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-                return -EINVAL;
-        }
-        
-        /* decryption schedule */
-        
-        ctx->dkey[0] = ss[0] = le32_to_cpu(key[0]);
-        ctx->dkey[1] = ss[1] = le32_to_cpu(key[1]);
-        ctx->dkey[2] = ss[2] = le32_to_cpu(key[2]);
-        ctx->dkey[3] = ss[3] = le32_to_cpu(key[3]);
-        switch (key_len) {
-        case 16:
-                kdf4(ctx->dkey, 0);
-                for (i = 1; i < 9; i++)
-                        kd4(ctx->dkey, i);
-                kdl4(ctx->dkey, 9);
-                break;
-                
-        case 24:
-                ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
-                ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
-                kdf6(ctx->dkey, 0);
-                for (i = 1; i < 7; i++)
-                        kd6(ctx->dkey, i);
-                kdl6(ctx->dkey, 7);
-                break;
-        case 32:
-                ctx->dkey[4] = ff(ss[4] = le32_to_cpu(key[4]));
-                ctx->dkey[5] = ff(ss[5] = le32_to_cpu(key[5]));
-                ctx->dkey[6] = ff(ss[6] = le32_to_cpu(key[6]));
-                ctx->dkey[7] = ff(ss[7] = le32_to_cpu(key[7]));
-                kdf8(ctx->dkey, 0);
-                for (i = 1; i < 6; i++)
-                        kd8(ctx->dkey, i);
-                kdl8(ctx->dkey, 6);
-                break;
-        }
-        return 0;
-}
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-        aes_enc_blk(tfm, dst, src);
-}
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-        aes_dec_blk(tfm, dst, src);
-}
-static struct crypto_alg aes_alg = {
-        .cra_name               =       "aes",
-        .cra_driver_name        =       "aes-i586",
-        .cra_priority           =       200,
-        .cra_flags              =       CRYPTO_ALG_TYPE_CIPHER,
-        .cra_blocksize          =       AES_BLOCK_SIZE,
-        .cra_ctxsize            =       sizeof(struct aes_ctx),
-        .cra_module             =       THIS_MODULE,
-        .cra_list               =       LIST_HEAD_INIT(aes_alg.cra_list),
-        .cra_u                  =       {
-                .cipher = {
-                        .cia_min_keysize        =       AES_MIN_KEY_SIZE,
-                        .cia_max_keysize        =       AES_MAX_KEY_SIZE,
-                        .cia_setkey             =       aes_set_key,
-                        .cia_encrypt            =       aes_encrypt,
-                        .cia_decrypt            =       aes_decrypt
-                }
-        }
-};
-static int __init aes_init(void)
-{
-        gen_tabs();
-        return crypto_register_alg(&aes_alg);
-}
-static void __exit aes_fini(void)
-{
-        crypto_unregister_alg(&aes_alg);
-}
-module_init(aes_init);
-module_exit(aes_fini);
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, i586 asm optimized");
-MODULE_LICENSE("Dual BSD/GPL");
-MODULE_AUTHOR("Fruhwirth Clemens, James Morris, Brian Gladman, Adam Richter");
-MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_64.c b/arch/x86/crypto/aes_64.c
deleted file mode 100644
index 5cdb13ea5cc2..000000000000
--- a/arch/x86/crypto/aes_64.c
+++ /dev/null
@@ -1,336 +0,0 @@
-/*
- * Cryptographic API.
- *
- * AES Cipher Algorithm.
- *
- * Based on Brian Gladman's code.
- *
- * Linux developers:
- *  Alexander Kjeldaas <astor@fast.no>
- *  Herbert Valerio Riedel <hvr@hvrlab.org>
- *  Kyle McMartin <kyle@debian.org>
- *  Adam J. Richter <adam@yggdrasil.com> (conversion to 2.5 API).
- *  Andreas Steinmetz <ast@domdv.de> (adapted to x86_64 assembler)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * ---------------------------------------------------------------------------
- * Copyright (c) 2002, Dr Brian Gladman <brg@gladman.me.uk>, Worcester, UK.
- * All rights reserved.
- *
- * LICENSE TERMS
- *
- * The free distribution and use of this software in both source and binary
- * form is allowed (with or without changes) provided that:
- *
- *   1. distributions of this source code include the above copyright
- *      notice, this list of conditions and the following disclaimer;
- *
- *   2. distributions in binary form include the above copyright
- *      notice, this list of conditions and the following disclaimer
- *      in the documentation and/or other associated materials;
- *
- *   3. the copyright holder's name is not used to endorse products
- *      built using this software without specific written permission.
- *
- * ALTERNATIVELY, provided that this notice is retained in full, this product
- * may be distributed under the terms of the GNU General Public License (GPL),
- * in which case the provisions of the GPL apply INSTEAD OF those given above.
- *
- * DISCLAIMER
- *
- * This software is provided 'as is' with no explicit or implied warranties
- * in respect of its properties, including, but not limited to, correctness
- * and/or fitness for purpose.
- * ---------------------------------------------------------------------------
- */
-/* Some changes from the Gladman version:
-    s/RIJNDAEL(e_key)/E_KEY/g
-    s/RIJNDAEL(d_key)/D_KEY/g
-*/
-#include <asm/byteorder.h>
-#include <linux/bitops.h>
-#include <linux/crypto.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#define AES_MIN_KEY_SIZE        16
-#define AES_MAX_KEY_SIZE        32
-#define AES_BLOCK_SIZE          16
-/*
- * #define byte(x, nr) ((unsigned char)((x) >> (nr*8)))
- */
-static inline u8 byte(const u32 x, const unsigned n)
-{
-        return x >> (n << 3);
-}
-struct aes_ctx
-{
-        u32 key_length;
-        u32 buf[120];
-};
-#define E_KEY (&ctx->buf[0])
-#define D_KEY (&ctx->buf[60])
-static u8 pow_tab[256] __initdata;
-static u8 log_tab[256] __initdata;
-static u8 sbx_tab[256] __initdata;
-static u8 isb_tab[256] __initdata;
-static u32 rco_tab[10];
-u32 aes_ft_tab[4][256];
-u32 aes_it_tab[4][256];
-u32 aes_fl_tab[4][256];
-u32 aes_il_tab[4][256];
-static inline u8 f_mult(u8 a, u8 b)
-{
-        u8 aa = log_tab[a], cc = aa + log_tab[b];
-        return pow_tab[cc + (cc < aa ? 1 : 0)];
-}
-#define ff_mult(a, b) (a && b ? f_mult(a, b) : 0)
-#define ls_box(x)                               \
-        (aes_fl_tab[0][byte(x, 0)] ^            \
-         aes_fl_tab[1][byte(x, 1)] ^            \
-         aes_fl_tab[2][byte(x, 2)] ^            \
-         aes_fl_tab[3][byte(x, 3)])
-static void __init gen_tabs(void)
-{
-        u32 i, t;
-        u8 p, q;
-        /* log and power tables for GF(2**8) finite field with
-           0x011b as modular polynomial - the simplest primitive
-           root is 0x03, used here to generate the tables */
-        for (i = 0, p = 1; i < 256; ++i) {
-                pow_tab[i] = (u8)p;
-                log_tab[p] = (u8)i;
-                p ^= (p << 1) ^ (p & 0x80 ? 0x01b : 0);
-        }
-        log_tab[1] = 0;
-        for (i = 0, p = 1; i < 10; ++i) {
-                rco_tab[i] = p;
-                p = (p << 1) ^ (p & 0x80 ? 0x01b : 0);
-        }
-        for (i = 0; i < 256; ++i) {
-                p = (i ? pow_tab[255 - log_tab[i]] : 0);
-                q = ((p >> 7) | (p << 1)) ^ ((p >> 6) | (p << 2));
-                p ^= 0x63 ^ q ^ ((q >> 6) | (q << 2));
-                sbx_tab[i] = p;
-                isb_tab[p] = (u8)i;
-        }
-        for (i = 0; i < 256; ++i) {
-                p = sbx_tab[i];
-                t = p;
-                aes_fl_tab[0][i] = t;
-                aes_fl_tab[1][i] = rol32(t, 8);
-                aes_fl_tab[2][i] = rol32(t, 16);
-                aes_fl_tab[3][i] = rol32(t, 24);
-                t = ((u32)ff_mult(2, p)) |
-                    ((u32)p << 8) |
-                    ((u32)p << 16) | ((u32)ff_mult(3, p) << 24);
-                aes_ft_tab[0][i] = t;
-                aes_ft_tab[1][i] = rol32(t, 8);
-                aes_ft_tab[2][i] = rol32(t, 16);
-                aes_ft_tab[3][i] = rol32(t, 24);
-                p = isb_tab[i];
-                t = p;
-                aes_il_tab[0][i] = t;
-                aes_il_tab[1][i] = rol32(t, 8);
-                aes_il_tab[2][i] = rol32(t, 16);
-                aes_il_tab[3][i] = rol32(t, 24);
-                t = ((u32)ff_mult(14, p)) |
-                    ((u32)ff_mult(9, p) << 8) |
-                    ((u32)ff_mult(13, p) << 16) |
-                    ((u32)ff_mult(11, p) << 24);
-                aes_it_tab[0][i] = t;
-                aes_it_tab[1][i] = rol32(t, 8);
-                aes_it_tab[2][i] = rol32(t, 16);
-                aes_it_tab[3][i] = rol32(t, 24);
-        }
-}
-#define star_x(x) (((x) & 0x7f7f7f7f) << 1) ^ ((((x) & 0x80808080) >> 7) * 0x1b)
-#define imix_col(y, x)                  \
-        u    = star_x(x);               \
-        v    = star_x(u);               \
-        w    = star_x(v);               \
-        t    = w ^ (x);                 \
-        (y)  = u ^ v ^ w;               \
-        (y) ^= ror32(u ^ t,  8) ^       \
-               ror32(v ^ t, 16) ^       \
-               ror32(t, 24)
-/* initialise the key schedule from the user supplied key */
-#define loop4(i)                                        \
-{                                                       \
-        t = ror32(t,  8); t = ls_box(t) ^ rco_tab[i];   \
-        t ^= E_KEY[4 * i];     E_KEY[4 * i + 4] = t;    \
-        t ^= E_KEY[4 * i + 1]; E_KEY[4 * i + 5] = t;    \
-        t ^= E_KEY[4 * i + 2]; E_KEY[4 * i + 6] = t;    \
-        t ^= E_KEY[4 * i + 3]; E_KEY[4 * i + 7] = t;    \
-}
-#define loop6(i)                                        \
-{                                                       \
-        t = ror32(t,  8); t = ls_box(t) ^ rco_tab[i];   \
-        t ^= E_KEY[6 * i];     E_KEY[6 * i + 6] = t;    \
-        t ^= E_KEY[6 * i + 1]; E_KEY[6 * i + 7] = t;    \
-        t ^= E_KEY[6 * i + 2]; E_KEY[6 * i + 8] = t;    \
-        t ^= E_KEY[6 * i + 3]; E_KEY[6 * i + 9] = t;    \
-        t ^= E_KEY[6 * i + 4]; E_KEY[6 * i + 10] = t;   \
-        t ^= E_KEY[6 * i + 5]; E_KEY[6 * i + 11] = t;   \
-}
-#define loop8(i)                                        \
-{                                                       \
-        t = ror32(t,  8); ; t = ls_box(t) ^ rco_tab[i]; \
-        t ^= E_KEY[8 * i];     E_KEY[8 * i + 8] = t;    \
-        t ^= E_KEY[8 * i + 1]; E_KEY[8 * i + 9] = t;    \
-        t ^= E_KEY[8 * i + 2]; E_KEY[8 * i + 10] = t;   \
-        t ^= E_KEY[8 * i + 3]; E_KEY[8 * i + 11] = t;   \
-        t  = E_KEY[8 * i + 4] ^ ls_box(t);              \
-        E_KEY[8 * i + 12] = t;                          \
-        t ^= E_KEY[8 * i + 5]; E_KEY[8 * i + 13] = t;   \
-        t ^= E_KEY[8 * i + 6]; E_KEY[8 * i + 14] = t;   \
-        t ^= E_KEY[8 * i + 7]; E_KEY[8 * i + 15] = t;   \
-}
-static int aes_set_key(struct crypto_tfm *tfm, const u8 *in_key,
-                       unsigned int key_len)
-{
-        struct aes_ctx *ctx = crypto_tfm_ctx(tfm);
-        const __le32 *key = (const __le32 *)in_key;
-        u32 *flags = &tfm->crt_flags;
-        u32 i, j, t, u, v, w;
-        if (key_len % 8) {
-                *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN;
-                return -EINVAL;
-        }
-        ctx->key_length = key_len;
-        D_KEY[key_len + 24] = E_KEY[0] = le32_to_cpu(key[0]);
-        D_KEY[key_len + 25] = E_KEY[1] = le32_to_cpu(key[1]);
-        D_KEY[key_len + 26] = E_KEY[2] = le32_to_cpu(key[2]);
-        D_KEY[key_len + 27] = E_KEY[3] = le32_to_cpu(key[3]);
-        switch (key_len) {
-        case 16:
-                t = E_KEY[3];
-                for (i = 0; i < 10; ++i)
-                        loop4(i);
-                break;
-        case 24:
-                E_KEY[4] = le32_to_cpu(key[4]);
-                t = E_KEY[5] = le32_to_cpu(key[5]);
-                for (i = 0; i < 8; ++i)
-                        loop6 (i);
-                break;
-        case 32:
-                E_KEY[4] = le32_to_cpu(key[4]);
-                E_KEY[5] = le32_to_cpu(key[5]);
-                E_KEY[6] = le32_to_cpu(key[6]);
-                t = E_KEY[7] = le32_to_cpu(key[7]);
-                for (i = 0; i < 7; ++i)
-                        loop8(i);
-                break;
-        }
-        D_KEY[0] = E_KEY[key_len + 24];
-        D_KEY[1] = E_KEY[key_len + 25];
-        D_KEY[2] = E_KEY[key_len + 26];
-        D_KEY[3] = E_KEY[key_len + 27];
-        for (i = 4; i < key_len + 24; ++i) {
-                j = key_len + 24 - (i & ~3) + (i & 3);
-                imix_col(D_KEY[j], E_KEY[i]);
-        }
-        return 0;
-}
-asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
-asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
-static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-        aes_enc_blk(tfm, dst, src);
-}
-static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-        aes_dec_blk(tfm, dst, src);
-}
-static struct crypto_alg aes_alg = {
-        .cra_name               =       "aes",
-        .cra_driver_name        =       "aes-x86_64",
-        .cra_priority           =       200,
-        .cra_flags              =       CRYPTO_ALG_TYPE_CIPHER,
-        .cra_blocksize          =       AES_BLOCK_SIZE,
-        .cra_ctxsize            =       sizeof(struct aes_ctx),
-        .cra_module             =       THIS_MODULE,
-        .cra_list               =       LIST_HEAD_INIT(aes_alg.cra_list),
-        .cra_u                  =       {
-                .cipher = {
-                        .cia_min_keysize        =       AES_MIN_KEY_SIZE,
-                        .cia_max_keysize        =       AES_MAX_KEY_SIZE,
-                        .cia_setkey             =       aes_set_key,
-                        .cia_encrypt            =       aes_encrypt,
-                        .cia_decrypt            =       aes_decrypt
-                }
-        }
-};
-static int __init aes_init(void)
-{
-        gen_tabs();
-        return crypto_register_alg(&aes_alg);
-}
-static void __exit aes_fini(void)
-{
-        crypto_unregister_alg(&aes_alg);
-}
-module_init(aes_init);
-module_exit(aes_fini);
-MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("aes");
diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c
new file mode 100644
index 000000000000..71f457827116
--- /dev/null
+++ b/arch/x86/crypto/aes_glue.c
@@ -0,0 +1,57 @@
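+/*
+ * Glue code for the asm-optimized version of the AES cipher algorithm:
+ * registers the "aes" cipher with the crypto API and forwards its
+ * encrypt/decrypt calls to the assembler routines aes_enc_blk()/aes_dec_blk().
+ */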
+#include <crypto/aes.h>
+#include <linux/crypto.h>       /* struct crypto_alg, crypto_register_alg() */
+#include <linux/init.h>         /* __init, __exit */
+#include <linux/module.h>       /* module_init(), MODULE_*(), THIS_MODULE */
+/*
+ * AES block primitives implemented in the per-architecture assembler object
+ * that the Makefile links with this file (aes-i586-asm_32.o or
+ * aes-x86_64-asm_64.o). Each one takes the transform, an output buffer and an
+ * input buffer, in that order.
+ */
+asmlinkage void aes_enc_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+asmlinkage void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in);
+static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+        aes_enc_blk(tfm, dst, src); /* .cia_encrypt hook: asm routine, dst is its out, src its in */
+}
+static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+        aes_dec_blk(tfm, dst, src); /* .cia_decrypt hook: asm routine, dst is its out, src its in */
+}
+static struct crypto_alg aes_alg = { /* registered by aes_init(), unregistered by aes_fini() */
+        .cra_name               = "aes",
+        .cra_driver_name        = "aes-asm", /* one name for both the i586 and x86_64 builds */
+        .cra_priority           = 200,
+        .cra_flags              = CRYPTO_ALG_TYPE_CIPHER,
+        .cra_blocksize          = AES_BLOCK_SIZE,
+        .cra_ctxsize            = sizeof(struct crypto_aes_ctx), /* i586 asm expects klen/ekey/dkey at context offsets 0/4/244 */
+        .cra_module             = THIS_MODULE,
+        .cra_list               = LIST_HEAD_INIT(aes_alg.cra_list),
+        .cra_u  = {
+                .cipher = {
+                        .cia_min_keysize        = AES_MIN_KEY_SIZE,
+                        .cia_max_keysize        = AES_MAX_KEY_SIZE,
+                        .cia_setkey             = crypto_aes_set_key, /* key setup lives outside this file */
+                        .cia_encrypt            = aes_encrypt, /* wrapper around aes_enc_blk() */
+                        .cia_decrypt            = aes_decrypt /* wrapper around aes_dec_blk() */
+                }
+        }
+};
+static int __init aes_init(void)
+{
+        return crypto_register_alg(&aes_alg); /* module init: register aes_alg; its result is the init status */
+}
+static void __exit aes_fini(void)
+{
+        crypto_unregister_alg(&aes_alg); /* module exit: undo the registration made in aes_init() */
+}
+module_init(aes_init);
+module_exit(aes_fini);
+MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm, asm optimized");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("aes"); /* same string as aes_alg.cra_name */
+MODULE_ALIAS("aes-asm"); /* same string as aes_alg.cra_driver_name */
diff --git a/arch/x86/crypto/salsa20-i586-asm_32.S b/arch/x86/crypto/salsa20-i586-asm_32.S
new file mode 100644
index 000000000000..72eb306680b2
--- /dev/null
+++ b/arch/x86/crypto/salsa20-i586-asm_32.S
@@ -0,0 +1,1114 @@
+# salsa20_pm.s version 20051229
+# D. J. Bernstein
+# Public domain.
+# enter ECRYPT_encrypt_bytes
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+        mov     %esp,%eax
+        and     $31,%eax
+        add     $256,%eax
+        sub     %eax,%esp
+        # eax_stack = eax
+        movl    %eax,80(%esp)
+        # ebx_stack = ebx
+        movl    %ebx,84(%esp)
+        # esi_stack = esi
+        movl    %esi,88(%esp)
+        # edi_stack = edi
+        movl    %edi,92(%esp)
+        # ebp_stack = ebp
+        movl    %ebp,96(%esp)
+        # x = arg1
+        movl    4(%esp,%eax),%edx
+        # m = arg2
+        movl    8(%esp,%eax),%esi
+        # out = arg3
+        movl    12(%esp,%eax),%edi
+        # bytes = arg4
+        movl    16(%esp,%eax),%ebx
+        # bytes -= 0
+        sub     $0,%ebx
+        # goto done if unsigned<=
+        jbe     ._done
+._start:
+        # in0 = *(uint32 *) (x + 0)
+        movl    0(%edx),%eax
+        # in1 = *(uint32 *) (x + 4)
+        movl    4(%edx),%ecx
+        # in2 = *(uint32 *) (x + 8)
+        movl    8(%edx),%ebp
+        # j0 = in0
+        movl    %eax,164(%esp)
+        # in3 = *(uint32 *) (x + 12)
+        movl    12(%edx),%eax
+        # j1 = in1
+        movl    %ecx,168(%esp)
+        # in4 = *(uint32 *) (x + 16)
+        movl    16(%edx),%ecx
+        # j2 = in2
+        movl    %ebp,172(%esp)
+        # in5 = *(uint32 *) (x + 20)
+        movl    20(%edx),%ebp
+        # j3 = in3
+        movl    %eax,176(%esp)
+        # in6 = *(uint32 *) (x + 24)
+        movl    24(%edx),%eax
+        # j4 = in4
+        movl    %ecx,180(%esp)
+        # in7 = *(uint32 *) (x + 28)
+        movl    28(%edx),%ecx
+        # j5 = in5
+        movl    %ebp,184(%esp)
+        # in8 = *(uint32 *) (x + 32)
+        movl    32(%edx),%ebp
+        # j6 = in6
+        movl    %eax,188(%esp)
+        # in9 = *(uint32 *) (x + 36)
+        movl    36(%edx),%eax
+        # j7 = in7
+        movl    %ecx,192(%esp)
+        # in10 = *(uint32 *) (x + 40)
+        movl    40(%edx),%ecx
+        # j8 = in8
+        movl    %ebp,196(%esp)
+        # in11 = *(uint32 *) (x + 44)
+        movl    44(%edx),%ebp
+        # j9 = in9
+        movl    %eax,200(%esp)
+        # in12 = *(uint32 *) (x + 48)
+        movl    48(%edx),%eax
+        # j10 = in10
+        movl    %ecx,204(%esp)
+        # in13 = *(uint32 *) (x + 52)
+        movl    52(%edx),%ecx
+        # j11 = in11
+        movl    %ebp,208(%esp)
+        # in14 = *(uint32 *) (x + 56)
+        movl    56(%edx),%ebp
+        # j12 = in12
+        movl    %eax,212(%esp)
+        # in15 = *(uint32 *) (x + 60)
+        movl    60(%edx),%eax
+        # j13 = in13
+        movl    %ecx,216(%esp)
+        # j14 = in14
+        movl    %ebp,220(%esp)
+        # j15 = in15
+        movl    %eax,224(%esp)
+        # x_backup = x
+        movl    %edx,64(%esp)
+._bytesatleast1:
+        #   bytes - 64
+        cmp     $64,%ebx
+        #   goto nocopy if unsigned>=
+        jae     ._nocopy
+        #     ctarget = out
+        movl    %edi,228(%esp)
+        #     out = &tmp
+        leal    0(%esp),%edi
+        #     i = bytes
+        mov     %ebx,%ecx
+        #     while (i) { *out++ = *m++; --i }
+        rep     movsb
+        #     out = &tmp
+        leal    0(%esp),%edi
+        #     m = &tmp
+        leal    0(%esp),%esi
+._nocopy:
+        #   out_backup = out
+        movl    %edi,72(%esp)
+        #   m_backup = m
+        movl    %esi,68(%esp)
+        #   bytes_backup = bytes
+        movl    %ebx,76(%esp)
+        #   in0 = j0
+        movl    164(%esp),%eax
+        #   in1 = j1
+        movl    168(%esp),%ecx
+        #   in2 = j2
+        movl    172(%esp),%edx
+        #   in3 = j3
+        movl    176(%esp),%ebx
+        #   x0 = in0
+        movl    %eax,100(%esp)
+        #   x1 = in1
+        movl    %ecx,104(%esp)
+        #   x2 = in2
+        movl    %edx,108(%esp)
+        #   x3 = in3
+        movl    %ebx,112(%esp)
+        #   in4 = j4
+        movl    180(%esp),%eax
+        #   in5 = j5
+        movl    184(%esp),%ecx
+        #   in6 = j6
+        movl    188(%esp),%edx
+        #   in7 = j7
+        movl    192(%esp),%ebx
+        #   x4 = in4
+        movl    %eax,116(%esp)
+        #   x5 = in5
+        movl    %ecx,120(%esp)
+        #   x6 = in6
+        movl    %edx,124(%esp)
+        #   x7 = in7
+        movl    %ebx,128(%esp)
+        #   in8 = j8
+        movl    196(%esp),%eax
+        #   in9 = j9
+        movl    200(%esp),%ecx
+        #   in10 = j10
+        movl    204(%esp),%edx
+        #   in11 = j11
+        movl    208(%esp),%ebx
+        #   x8 = in8
+        movl    %eax,132(%esp)
+        #   x9 = in9
+        movl    %ecx,136(%esp)
+        #   x10 = in10
+        movl    %edx,140(%esp)
+        #   x11 = in11
+        movl    %ebx,144(%esp)
+        #   in12 = j12
+        movl    212(%esp),%eax
+        #   in13 = j13
+        movl    216(%esp),%ecx
+        #   in14 = j14
+        movl    220(%esp),%edx
+        #   in15 = j15
+        movl    224(%esp),%ebx
+        #   x12 = in12
+        movl    %eax,148(%esp)
+        #   x13 = in13
+        movl    %ecx,152(%esp)
+        #   x14 = in14
+        movl    %edx,156(%esp)
+        #   x15 = in15
+        movl    %ebx,160(%esp)
+        #   i = 20
+        mov     $20,%ebp
+        # p = x0
+        movl    100(%esp),%eax
+        # s = x5
+        movl    120(%esp),%ecx
+        # t = x10
+        movl    140(%esp),%edx
+        # w = x15
+        movl    160(%esp),%ebx
+._mainloop:
+        # x0 = p
+        movl    %eax,100(%esp)
+        #                               x10 = t
+        movl    %edx,140(%esp)
+        # p += x12
+        addl    148(%esp),%eax
+        #               x5 = s
+        movl    %ecx,120(%esp)
+        #                               t += x6
+        addl    124(%esp),%edx
+        #                                               x15 = w
+        movl    %ebx,160(%esp)
+        #               r = x1
+        movl    104(%esp),%esi
+        #               r += s
+        add     %ecx,%esi
+        #                                               v = x11
+        movl    144(%esp),%edi
+        #                                               v += w
+        add     %ebx,%edi
+        # p <<<= 7
+        rol     $7,%eax
+        # p ^= x4
+        xorl    116(%esp),%eax
+        #                               t <<<= 7
+        rol     $7,%edx
+        #                               t ^= x14
+        xorl    156(%esp),%edx
+        #               r <<<= 7
+        rol     $7,%esi
+        #               r ^= x9
+        xorl    136(%esp),%esi
+        #                                               v <<<= 7
+        rol     $7,%edi
+        #                                               v ^= x3
+        xorl    112(%esp),%edi
+        # x4 = p
+        movl    %eax,116(%esp)
+        #                               x14 = t
+        movl    %edx,156(%esp)
+        # p += x0
+        addl    100(%esp),%eax
+        #               x9 = r
+        movl    %esi,136(%esp)
+        #                               t += x10
+        addl    140(%esp),%edx
+        #                                               x3 = v
+        movl    %edi,112(%esp)
+        # p <<<= 9
+        rol     $9,%eax
+        # p ^= x8
+        xorl    132(%esp),%eax
+        #                               t <<<= 9
+        rol     $9,%edx
+        #                               t ^= x2
+        xorl    108(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 9
+        rol     $9,%ecx
+        #               s ^= x13
+        xorl    152(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 9
+        rol     $9,%ebx
+        #                                               w ^= x7
+        xorl    128(%esp),%ebx
+        # x8 = p
+        movl    %eax,132(%esp)
+        #                               x2 = t
+        movl    %edx,108(%esp)
+        # p += x4
+        addl    116(%esp),%eax
+        #               x13 = s
+        movl    %ecx,152(%esp)
+        #                               t += x14
+        addl    156(%esp),%edx
+        #                                               x7 = w
+        movl    %ebx,128(%esp)
+        # p <<<= 13
+        rol     $13,%eax
+        # p ^= x12
+        xorl    148(%esp),%eax
+        #                               t <<<= 13
+        rol     $13,%edx
+        #                               t ^= x6
+        xorl    124(%esp),%edx
+        #               r += s
+        add     %ecx,%esi
+        #               r <<<= 13
+        rol     $13,%esi
+        #               r ^= x1
+        xorl    104(%esp),%esi
+        #                                               v += w
+        add     %ebx,%edi
+        #                                               v <<<= 13
+        rol     $13,%edi
+        #                                               v ^= x11
+        xorl    144(%esp),%edi
+        # x12 = p
+        movl    %eax,148(%esp)
+        #                               x6 = t
+        movl    %edx,124(%esp)
+        # p += x8
+        addl    132(%esp),%eax
+        #               x1 = r
+        movl    %esi,104(%esp)
+        #                               t += x2
+        addl    108(%esp),%edx
+        #                                               x11 = v
+        movl    %edi,144(%esp)
+        # p <<<= 18
+        rol     $18,%eax
+        # p ^= x0
+        xorl    100(%esp),%eax
+        #                               t <<<= 18
+        rol     $18,%edx
+        #                               t ^= x10
+        xorl    140(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 18
+        rol     $18,%ecx
+        #               s ^= x5
+        xorl    120(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 18
+        rol     $18,%ebx
+        #                                               w ^= x15
+        xorl    160(%esp),%ebx
+        # x0 = p
+        movl    %eax,100(%esp)
+        #                               x10 = t
+        movl    %edx,140(%esp)
+        # p += x3
+        addl    112(%esp),%eax
+        # p <<<= 7
+        rol     $7,%eax
+        #               x5 = s
+        movl    %ecx,120(%esp)
+        #                               t += x9
+        addl    136(%esp),%edx
+        #                                               x15 = w
+        movl    %ebx,160(%esp)
+        #               r = x4
+        movl    116(%esp),%esi
+        #               r += s
+        add     %ecx,%esi
+        #                                               v = x14
+        movl    156(%esp),%edi
+        #                                               v += w
+        add     %ebx,%edi
+        # p ^= x1
+        xorl    104(%esp),%eax
+        #                               t <<<= 7
+        rol     $7,%edx
+        #                               t ^= x11
+        xorl    144(%esp),%edx
+        #               r <<<= 7
+        rol     $7,%esi
+        #               r ^= x6
+        xorl    124(%esp),%esi
+        #                                               v <<<= 7
+        rol     $7,%edi
+        #                                               v ^= x12
+        xorl    148(%esp),%edi
+        # x1 = p
+        movl    %eax,104(%esp)
+        #                               x11 = t
+        movl    %edx,144(%esp)
+        # p += x0
+        addl    100(%esp),%eax
+        #               x6 = r
+        movl    %esi,124(%esp)
+        #                               t += x10
+        addl    140(%esp),%edx
+        #                                               x12 = v
+        movl    %edi,148(%esp)
+        # p <<<= 9
+        rol     $9,%eax
+        # p ^= x2
+        xorl    108(%esp),%eax
+        #                               t <<<= 9
+        rol     $9,%edx
+        #                               t ^= x8
+        xorl    132(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 9
+        rol     $9,%ecx
+        #               s ^= x7
+        xorl    128(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 9
+        rol     $9,%ebx
+        #                                               w ^= x13
+        xorl    152(%esp),%ebx
+        # x2 = p
+        movl    %eax,108(%esp)
+        #                               x8 = t
+        movl    %edx,132(%esp)
+        # p += x1
+        addl    104(%esp),%eax
+        #               x7 = s
+        movl    %ecx,128(%esp)
+        #                               t += x11
+        addl    144(%esp),%edx
+        #                                               x13 = w
+        movl    %ebx,152(%esp)
+        # p <<<= 13
+        rol     $13,%eax
+        # p ^= x3
+        xorl    112(%esp),%eax
+        #                               t <<<= 13
+        rol     $13,%edx
+        #                               t ^= x9
+        xorl    136(%esp),%edx
+        #               r += s
+        add     %ecx,%esi
+        #               r <<<= 13
+        rol     $13,%esi
+        #               r ^= x4
+        xorl    116(%esp),%esi
+        #                                               v += w
+        add     %ebx,%edi
+        #                                               v <<<= 13
+        rol     $13,%edi
+        #                                               v ^= x14
+        xorl    156(%esp),%edi
+        # x3 = p
+        movl    %eax,112(%esp)
+        #                               x9 = t
+        movl    %edx,136(%esp)
+        # p += x2
+        addl    108(%esp),%eax
+        #               x4 = r
+        movl    %esi,116(%esp)
+        #                               t += x8
+        addl    132(%esp),%edx
+        #                                               x14 = v
+        movl    %edi,156(%esp)
+        # p <<<= 18
+        rol     $18,%eax
+        # p ^= x0
+        xorl    100(%esp),%eax
+        #                               t <<<= 18
+        rol     $18,%edx
+        #                               t ^= x10
+        xorl    140(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 18
+        rol     $18,%ecx
+        #               s ^= x5
+        xorl    120(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 18
+        rol     $18,%ebx
+        #                                               w ^= x15
+        xorl    160(%esp),%ebx
+        # x0 = p
+        movl    %eax,100(%esp)
+        #                               x10 = t
+        movl    %edx,140(%esp)
+        # p += x12
+        addl    148(%esp),%eax
+        #               x5 = s
+        movl    %ecx,120(%esp)
+        #                               t += x6
+        addl    124(%esp),%edx
+        #                                               x15 = w
+        movl    %ebx,160(%esp)
+        #               r = x1
+        movl    104(%esp),%esi
+        #               r += s
+        add     %ecx,%esi
+        #                                               v = x11
+        movl    144(%esp),%edi
+        #                                               v += w
+        add     %ebx,%edi
+        # p <<<= 7
+        rol     $7,%eax
+        # p ^= x4
+        xorl    116(%esp),%eax
+        #                               t <<<= 7
+        rol     $7,%edx
+        #                               t ^= x14
+        xorl    156(%esp),%edx
+        #               r <<<= 7
+        rol     $7,%esi
+        #               r ^= x9
+        xorl    136(%esp),%esi
+        #                                               v <<<= 7
+        rol     $7,%edi
+        #                                               v ^= x3
+        xorl    112(%esp),%edi
+        # x4 = p
+        movl    %eax,116(%esp)
+        #                               x14 = t
+        movl    %edx,156(%esp)
+        # p += x0
+        addl    100(%esp),%eax
+        #               x9 = r
+        movl    %esi,136(%esp)
+        #                               t += x10
+        addl    140(%esp),%edx
+        #                                               x3 = v
+        movl    %edi,112(%esp)
+        # p <<<= 9
+        rol     $9,%eax
+        # p ^= x8
+        xorl    132(%esp),%eax
+        #                               t <<<= 9
+        rol     $9,%edx
+        #                               t ^= x2
+        xorl    108(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 9
+        rol     $9,%ecx
+        #               s ^= x13
+        xorl    152(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 9
+        rol     $9,%ebx
+        #                                               w ^= x7
+        xorl    128(%esp),%ebx
+        # x8 = p
+        movl    %eax,132(%esp)
+        #                               x2 = t
+        movl    %edx,108(%esp)
+        # p += x4
+        addl    116(%esp),%eax
+        #               x13 = s
+        movl    %ecx,152(%esp)
+        #                               t += x14
+        addl    156(%esp),%edx
+        #                                               x7 = w
+        movl    %ebx,128(%esp)
+        # p <<<= 13
+        rol     $13,%eax
+        # p ^= x12
+        xorl    148(%esp),%eax
+        #                               t <<<= 13
+        rol     $13,%edx
+        #                               t ^= x6
+        xorl    124(%esp),%edx
+        #               r += s
+        add     %ecx,%esi
+        #               r <<<= 13
+        rol     $13,%esi
+        #               r ^= x1
+        xorl    104(%esp),%esi
+        #                                               v += w
+        add     %ebx,%edi
+        #                                               v <<<= 13
+        rol     $13,%edi
+        #                                               v ^= x11
+        xorl    144(%esp),%edi
+        # x12 = p
+        movl    %eax,148(%esp)
+        #                               x6 = t
+        movl    %edx,124(%esp)
+        # p += x8
+        addl    132(%esp),%eax
+        #               x1 = r
+        movl    %esi,104(%esp)
+        #                               t += x2
+        addl    108(%esp),%edx
+        #                                               x11 = v
+        movl    %edi,144(%esp)
+        # p <<<= 18
+        rol     $18,%eax
+        # p ^= x0
+        xorl    100(%esp),%eax
+        #                               t <<<= 18
+        rol     $18,%edx
+        #                               t ^= x10
+        xorl    140(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 18
+        rol     $18,%ecx
+        #               s ^= x5
+        xorl    120(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 18
+        rol     $18,%ebx
+        #                                               w ^= x15
+        xorl    160(%esp),%ebx
+        # x0 = p
+        movl    %eax,100(%esp)
+        #                               x10 = t
+        movl    %edx,140(%esp)
+        # p += x3
+        addl    112(%esp),%eax
+        # p <<<= 7
+        rol     $7,%eax
+        #               x5 = s
+        movl    %ecx,120(%esp)
+        #                               t += x9
+        addl    136(%esp),%edx
+        #                                               x15 = w
+        movl    %ebx,160(%esp)
+        #               r = x4
+        movl    116(%esp),%esi
+        #               r += s
+        add     %ecx,%esi
+        #                                               v = x14
+        movl    156(%esp),%edi
+        #                                               v += w
+        add     %ebx,%edi
+        # p ^= x1
+        xorl    104(%esp),%eax
+        #                               t <<<= 7
+        rol     $7,%edx
+        #                               t ^= x11
+        xorl    144(%esp),%edx
+        #               r <<<= 7
+        rol     $7,%esi
+        #               r ^= x6
+        xorl    124(%esp),%esi
+        #                                               v <<<= 7
+        rol     $7,%edi
+        #                                               v ^= x12
+        xorl    148(%esp),%edi
+        # x1 = p
+        movl    %eax,104(%esp)
+        #                               x11 = t
+        movl    %edx,144(%esp)
+        # p += x0
+        addl    100(%esp),%eax
+        #               x6 = r
+        movl    %esi,124(%esp)
+        #                               t += x10
+        addl    140(%esp),%edx
+        #                                               x12 = v
+        movl    %edi,148(%esp)
+        # p <<<= 9
+        rol     $9,%eax
+        # p ^= x2
+        xorl    108(%esp),%eax
+        #                               t <<<= 9
+        rol     $9,%edx
+        #                               t ^= x8
+        xorl    132(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 9
+        rol     $9,%ecx
+        #               s ^= x7
+        xorl    128(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 9
+        rol     $9,%ebx
+        #                                               w ^= x13
+        xorl    152(%esp),%ebx
+        # x2 = p
+        movl    %eax,108(%esp)
+        #                               x8 = t
+        movl    %edx,132(%esp)
+        # p += x1
+        addl    104(%esp),%eax
+        #               x7 = s
+        movl    %ecx,128(%esp)
+        #                               t += x11
+        addl    144(%esp),%edx
+        #                                               x13 = w
+        movl    %ebx,152(%esp)
+        # p <<<= 13
+        rol     $13,%eax
+        # p ^= x3
+        xorl    112(%esp),%eax
+        #                               t <<<= 13
+        rol     $13,%edx
+        #                               t ^= x9
+        xorl    136(%esp),%edx
+        #               r += s
+        add     %ecx,%esi
+        #               r <<<= 13
+        rol     $13,%esi
+        #               r ^= x4
+        xorl    116(%esp),%esi
+        #                                               v += w
+        add     %ebx,%edi
+        #                                               v <<<= 13
+        rol     $13,%edi
+        #                                               v ^= x14
+        xorl    156(%esp),%edi
+        # x3 = p
+        movl    %eax,112(%esp)
+        #                               x9 = t
+        movl    %edx,136(%esp)
+        # p += x2
+        addl    108(%esp),%eax
+        #               x4 = r
+        movl    %esi,116(%esp)
+        #                               t += x8
+        addl    132(%esp),%edx
+        #                                               x14 = v
+        movl    %edi,156(%esp)
+        # p <<<= 18
+        rol     $18,%eax
+        # p ^= x0
+        xorl    100(%esp),%eax
+        #                               t <<<= 18
+        rol     $18,%edx
+        #                               t ^= x10
+        xorl    140(%esp),%edx
+        #               s += r
+        add     %esi,%ecx
+        #               s <<<= 18
+        rol     $18,%ecx
+        #               s ^= x5
+        xorl    120(%esp),%ecx
+        #                                               w += v
+        add     %edi,%ebx
+        #                                               w <<<= 18
+        rol     $18,%ebx
+        #                                               w ^= x15
+        xorl    160(%esp),%ebx
+        # i -= 4
+        sub     $4,%ebp
+        # goto mainloop if unsigned >
+        ja      ._mainloop
+        # x0 = p
+        movl    %eax,100(%esp)
+        # x5 = s
+        movl    %ecx,120(%esp)
+        # x10 = t
+        movl    %edx,140(%esp)
+        # x15 = w
+        movl    %ebx,160(%esp)
+        #   out = out_backup
+        movl    72(%esp),%edi
+        #   m = m_backup
+        movl    68(%esp),%esi
+        #   in0 = x0
+        movl    100(%esp),%eax
+        #   in1 = x1
+        movl    104(%esp),%ecx
+        #   in0 += j0
+        addl    164(%esp),%eax
+        #   in1 += j1
+        addl    168(%esp),%ecx
+        #   in0 ^= *(uint32 *) (m + 0)
+        xorl    0(%esi),%eax
+        #   in1 ^= *(uint32 *) (m + 4)
+        xorl    4(%esi),%ecx
+        #   *(uint32 *) (out + 0) = in0
+        movl    %eax,0(%edi)
+        #   *(uint32 *) (out + 4) = in1
+        movl    %ecx,4(%edi)
+        #   in2 = x2
+        movl    108(%esp),%eax
+        #   in3 = x3
+        movl    112(%esp),%ecx
+        #   in2 += j2
+        addl    172(%esp),%eax
+        #   in3 += j3
+        addl    176(%esp),%ecx
+        #   in2 ^= *(uint32 *) (m + 8)
+        xorl    8(%esi),%eax
+        #   in3 ^= *(uint32 *) (m + 12)
+        xorl    12(%esi),%ecx
+        #   *(uint32 *) (out + 8) = in2
+        movl    %eax,8(%edi)
+        #   *(uint32 *) (out + 12) = in3
+        movl    %ecx,12(%edi)
+        #   in4 = x4
+        movl    116(%esp),%eax
+        #   in5 = x5
+        movl    120(%esp),%ecx
+        #   in4 += j4
+        addl    180(%esp),%eax
+        #   in5 += j5
+        addl    184(%esp),%ecx
+        #   in4 ^= *(uint32 *) (m + 16)
+        xorl    16(%esi),%eax
+        #   in5 ^= *(uint32 *) (m + 20)
+        xorl    20(%esi),%ecx
+        #   *(uint32 *) (out + 16) = in4
+        movl    %eax,16(%edi)
+        #   *(uint32 *) (out + 20) = in5
+        movl    %ecx,20(%edi)
+        #   in6 = x6
+        movl    124(%esp),%eax
+        #   in7 = x7
+        movl    128(%esp),%ecx
+        #   in6 += j6
+        addl    188(%esp),%eax
+        #   in7 += j7
+        addl    192(%esp),%ecx
+        #   in6 ^= *(uint32 *) (m + 24)
+        xorl    24(%esi),%eax
+        #   in7 ^= *(uint32 *) (m + 28)
+        xorl    28(%esi),%ecx
+        #   *(uint32 *) (out + 24) = in6
+        movl    %eax,24(%edi)
+        #   *(uint32 *) (out + 28) = in7
+        movl    %ecx,28(%edi)
+        #   in8 = x8
+        movl    132(%esp),%eax
+        #   in9 = x9
+        movl    136(%esp),%ecx
+        #   in8 += j8
+        addl    196(%esp),%eax
+        #   in9 += j9
+        addl    200(%esp),%ecx
+        #   in8 ^= *(uint32 *) (m + 32)
+        xorl    32(%esi),%eax
+        #   in9 ^= *(uint32 *) (m + 36)
+        xorl    36(%esi),%ecx
+        #   *(uint32 *) (out + 32) = in8
+        movl    %eax,32(%edi)
+        #   *(uint32 *) (out + 36) = in9
+        movl    %ecx,36(%edi)
+        #   in10 = x10
+        movl    140(%esp),%eax
+        #   in11 = x11
+        movl    144(%esp),%ecx
+        #   in10 += j10
+        addl    204(%esp),%eax
+        #   in11 += j11
+        addl    208(%esp),%ecx
+        #   in10 ^= *(uint32 *) (m + 40)
+        xorl    40(%esi),%eax
+        #   in11 ^= *(uint32 *) (m + 44)
+        xorl    44(%esi),%ecx
+        #   *(uint32 *) (out + 40) = in10
+        movl    %eax,40(%edi)
+        #   *(uint32 *) (out + 44) = in11
+        movl    %ecx,44(%edi)
+        #   in12 = x12
+        movl    148(%esp),%eax
+        #   in13 = x13
+        movl    152(%esp),%ecx
+        #   in12 += j12
+        addl    212(%esp),%eax
+        #   in13 += j13
+        addl    216(%esp),%ecx
+        #   in12 ^= *(uint32 *) (m + 48)
+        xorl    48(%esi),%eax
+        #   in13 ^= *(uint32 *) (m + 52)
+        xorl    52(%esi),%ecx
+        #   *(uint32 *) (out + 48) = in12
+        movl    %eax,48(%edi)
+        #   *(uint32 *) (out + 52) = in13
+        movl    %ecx,52(%edi)
+        #   in14 = x14
+        movl    156(%esp),%eax
+        #   in15 = x15
+        movl    160(%esp),%ecx
+        #   in14 += j14
+        addl    220(%esp),%eax
+        #   in15 += j15
+        addl    224(%esp),%ecx
+        #   in14 ^= *(uint32 *) (m + 56)
+        xorl    56(%esi),%eax
+        #   in15 ^= *(uint32 *) (m + 60)
+        xorl    60(%esi),%ecx
+        #   *(uint32 *) (out + 56) = in14
+        movl    %eax,56(%edi)
+        #   *(uint32 *) (out + 60) = in15
+        movl    %ecx,60(%edi)
+        #   bytes = bytes_backup
+        movl    76(%esp),%ebx
+        #   in8 = j8
+        movl    196(%esp),%eax
+        #   in9 = j9
+        movl    200(%esp),%ecx
+        #   in8 += 1
+        add     $1,%eax
+        #   in9 += 0 + carry
+        adc     $0,%ecx
+        #   j8 = in8
+        movl    %eax,196(%esp)
+        #   j9 = in9
+        movl    %ecx,200(%esp)
+        #   bytes - 64
+        cmp     $64,%ebx
+        #   goto bytesatleast65 if unsigned>
+        ja      ._bytesatleast65
+        #     goto bytesatleast64 if unsigned>=
+        jae     ._bytesatleast64
+        #       m = out
+        mov     %edi,%esi
+        #       out = ctarget
+        movl    228(%esp),%edi
+        #       i = bytes
+        mov     %ebx,%ecx
+        #       while (i) { *out++ = *m++; --i }
+        rep     movsb
+._bytesatleast64:
+        #     x = x_backup
+        movl    64(%esp),%eax
+        #     in8 = j8
+        movl    196(%esp),%ecx
+        #     in9 = j9
+        movl    200(%esp),%edx
+        #     *(uint32 *) (x + 32) = in8
+        movl    %ecx,32(%eax)
+        #     *(uint32 *) (x + 36) = in9
+        movl    %edx,36(%eax)
+._done:
+        #     eax = eax_stack
+        movl    80(%esp),%eax
+        #     ebx = ebx_stack
+        movl    84(%esp),%ebx
+        #     esi = esi_stack
+        movl    88(%esp),%esi
+        #     edi = edi_stack
+        movl    92(%esp),%edi
+        #     ebp = ebp_stack
+        movl    96(%esp),%ebp
+        #     leave
+        add     %eax,%esp
+        ret
+._bytesatleast65:
+        #   bytes -= 64
+        sub     $64,%ebx
+        #   out += 64
+        add     $64,%edi
+        #   m += 64
+        add     $64,%esi
+        # goto bytesatleast1
+        jmp     ._bytesatleast1
+# ECRYPT_keysetup(x, k, kbits): fill state words 0-5 and 10-15 of x from key k; x+24..x+39 is not written
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+        mov     %esp,%eax
+        and     $31,%eax
+        add     $256,%eax
+        sub     %eax,%esp
+        #   eax_stack = eax  (eax = 256 + (entry esp & 31), the frame just carved out; esp is now 32-byte aligned)
+        movl    %eax,64(%esp)
+        #   ebx_stack = ebx  (ebx, esi, edi and ebp are spilled here and reloaded at keysetupdone)
+        movl    %ebx,68(%esp)
+        #   esi_stack = esi
+        movl    %esi,72(%esp)
+        #   edi_stack = edi
+        movl    %edi,76(%esp)
+        #   ebp_stack = ebp
+        movl    %ebp,80(%esp)
+        #   k = arg2  (key pointer; esp + eax is the entry esp, so the arguments sit 4, 8 and 12 bytes above it)
+        movl    8(%esp,%eax),%ecx
+        #   kbits = arg3  (key length in bits, compared against 256 below)
+        movl    12(%esp,%eax),%edx
+        #   x = arg1  (state pointer; loaded last because it overwrites the eax used for addressing)
+        movl    4(%esp,%eax),%eax
+        #   in1 = *(uint32 *) (k + 0)  (the first 16 key bytes go to x+4..x+19 for either key size)
+        movl    0(%ecx),%ebx
+        #   in2 = *(uint32 *) (k + 4)
+        movl    4(%ecx),%esi
+        #   in3 = *(uint32 *) (k + 8)
+        movl    8(%ecx),%edi
+        #   in4 = *(uint32 *) (k + 12)
+        movl    12(%ecx),%ebp
+        #   *(uint32 *) (x + 4) = in1
+        movl    %ebx,4(%eax)
+        #   *(uint32 *) (x + 8) = in2
+        movl    %esi,8(%eax)
+        #   *(uint32 *) (x + 12) = in3
+        movl    %edi,12(%eax)
+        #   *(uint32 *) (x + 16) = in4
+        movl    %ebp,16(%eax)
+        #   kbits - 256
+        cmp     $256,%edx
+        #   goto kbits128 if unsigned<  (every kbits below 256 takes the 128-bit path; kbits is not validated here)
+        jb      ._kbits128
+._kbits256:
+        #     in11 = *(uint32 *) (k + 16)  (256-bit key: the second 16 key bytes go to x+44..x+59)
+        movl    16(%ecx),%edx
+        #     in12 = *(uint32 *) (k + 20)
+        movl    20(%ecx),%ebx
+        #     in13 = *(uint32 *) (k + 24)
+        movl    24(%ecx),%esi
+        #     in14 = *(uint32 *) (k + 28)  (ecx, the key pointer, is overwritten; k is dead after this load)
+        movl    28(%ecx),%ecx
+        #     *(uint32 *) (x + 44) = in11
+        movl    %edx,44(%eax)
+        #     *(uint32 *) (x + 48) = in12
+        movl    %ebx,48(%eax)
+        #     *(uint32 *) (x + 52) = in13
+        movl    %esi,52(%eax)
+        #     *(uint32 *) (x + 56) = in14
+        movl    %ecx,56(%eax)
+        #     in0 = 1634760805  (0x61707865, the bytes "expa" when stored little-endian)
+        mov     $1634760805,%ecx
+        #     in5 = 857760878  (0x3320646e, the bytes "nd 3")
+        mov     $857760878,%edx
+        #     in10 = 2036477234  (0x79622d32, the bytes "2-by")
+        mov     $2036477234,%ebx
+        #     in15 = 1797285236  (0x6b206574, the bytes "te k")
+        mov     $1797285236,%esi
+        #     *(uint32 *) (x + 0) = in0
+        movl    %ecx,0(%eax)
+        #     *(uint32 *) (x + 20) = in5
+        movl    %edx,20(%eax)
+        #     *(uint32 *) (x + 40) = in10
+        movl    %ebx,40(%eax)
+        #     *(uint32 *) (x + 60) = in15
+        movl    %esi,60(%eax)
+        #   goto keysetupdone
+        jmp     ._keysetupdone
+._kbits128:
+        #     in11 = *(uint32 *) (k + 0)  (128-bit key: the same 16 key bytes are repeated at x+44..x+59)
+        movl    0(%ecx),%edx
+        #     in12 = *(uint32 *) (k + 4)
+        movl    4(%ecx),%ebx
+        #     in13 = *(uint32 *) (k + 8)
+        movl    8(%ecx),%esi
+        #     in14 = *(uint32 *) (k + 12)  (ecx, the key pointer, is overwritten; k is dead after this load)
+        movl    12(%ecx),%ecx
+        #     *(uint32 *) (x + 44) = in11
+        movl    %edx,44(%eax)
+        #     *(uint32 *) (x + 48) = in12
+        movl    %ebx,48(%eax)
+        #     *(uint32 *) (x + 52) = in13
+        movl    %esi,52(%eax)
+        #     *(uint32 *) (x + 56) = in14
+        movl    %ecx,56(%eax)
+        #     in0 = 1634760805  (0x61707865, the bytes "expa"; same word as the 256-bit path)
+        mov     $1634760805,%ecx
+        #     in5 = 824206446  (0x3120646e, the bytes "nd 1"; differs from the 256-bit path)
+        mov     $824206446,%edx
+        #     in10 = 2036477238  (0x79622d36, the bytes "6-by"; differs from the 256-bit path)
+        mov     $2036477238,%ebx
+        #     in15 = 1797285236  (0x6b206574, the bytes "te k"; same word as the 256-bit path)
+        mov     $1797285236,%esi
+        #     *(uint32 *) (x + 0) = in0
+        movl    %ecx,0(%eax)
+        #     *(uint32 *) (x + 20) = in5
+        movl    %edx,20(%eax)
+        #     *(uint32 *) (x + 40) = in10
+        movl    %ebx,40(%eax)
+        #     *(uint32 *) (x + 60) = in15
+        movl    %esi,60(%eax)
+._keysetupdone:
+        #   eax = eax_stack  (reload the frame size saved in the prologue)
+        movl    64(%esp),%eax
+        #   ebx = ebx_stack
+        movl    68(%esp),%ebx
+        #   esi = esi_stack
+        movl    72(%esp),%esi
+        #   edi = edi_stack
+        movl    76(%esp),%edi
+        #   ebp = ebp_stack
+        movl    80(%esp),%ebp
+        # leave  (esp += frame size, undoing the prologue subtraction so ret sees the return address)
+        add     %eax,%esp
+        ret
+# ECRYPT_ivsetup(x, iv): copy 8 bytes from iv to x+24..x+31 and zero the 8 bytes at x+32..x+39
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+        mov     %esp,%eax
+        and     $31,%eax
+        add     $256,%eax
+        sub     %eax,%esp
+        #   eax_stack = eax  (eax = 256 + (entry esp & 31), the frame just carved out; esp is now 32-byte aligned)
+        movl    %eax,64(%esp)
+        #   ebx_stack = ebx  (ebx and esi are used as scratch for the two zero words below)
+        movl    %ebx,68(%esp)
+        #   esi_stack = esi
+        movl    %esi,72(%esp)
+        #   edi_stack = edi  (edi and ebp are saved and restored although the body never modifies them)
+        movl    %edi,76(%esp)
+        #   ebp_stack = ebp
+        movl    %ebp,80(%esp)
+        #   iv = arg2  (iv pointer; esp + eax is the entry esp, so the arguments sit 4 and 8 bytes above it)
+        movl    8(%esp,%eax),%ecx
+        #   x = arg1  (state pointer; loaded last because it overwrites the eax used for addressing)
+        movl    4(%esp,%eax),%eax
+        #   in6 = *(uint32 *) (iv + 0)
+        movl    0(%ecx),%edx
+        #   in7 = *(uint32 *) (iv + 4)  (ecx, the iv pointer, is overwritten; iv is dead after this load)
+        movl    4(%ecx),%ecx
+        #   in8 = 0  (x+32 and x+36 are the pair ECRYPT_encrypt_bytes increments with carry; reset to zero here)
+        mov     $0,%ebx
+        #   in9 = 0
+        mov     $0,%esi
+        #   *(uint32 *) (x + 24) = in6
+        movl    %edx,24(%eax)
+        #   *(uint32 *) (x + 28) = in7
+        movl    %ecx,28(%eax)
+        #   *(uint32 *) (x + 32) = in8
+        movl    %ebx,32(%eax)
+        #   *(uint32 *) (x + 36) = in9
+        movl    %esi,36(%eax)
+        #   eax = eax_stack  (reload the frame size saved in the prologue)
+        movl    64(%esp),%eax
+        #   ebx = ebx_stack
+        movl    68(%esp),%ebx
+        #   esi = esi_stack
+        movl    72(%esp),%esi
+        #   edi = edi_stack
+        movl    76(%esp),%edi
+        #   ebp = ebp_stack
+        movl    80(%esp),%ebp
+        # leave  (esp += frame size, undoing the prologue subtraction so ret sees the return address)
+        add     %eax,%esp
+        ret
diff --git a/arch/x86/crypto/salsa20-x86_64-asm_64.S b/arch/x86/crypto/salsa20-x86_64-asm_64.S
new file mode 100644
index 000000000000..6214a9b09706
--- /dev/null
+++ b/arch/x86/crypto/salsa20-x86_64-asm_64.S
@@ -0,0 +1,920 @@
+# enter ECRYPT_encrypt_bytes
+.text
+.p2align 5
+.globl ECRYPT_encrypt_bytes
+ECRYPT_encrypt_bytes:
+        mov     %rsp,%r11
+        and     $31,%r11
+        add     $256,%r11
+        sub     %r11,%rsp
+        # x = arg1
+        mov     %rdi,%r8
+        # m = arg2
+        mov     %rsi,%rsi
+        # out = arg3
+        mov     %rdx,%rdi
+        # bytes = arg4
+        mov     %rcx,%rdx
+        #               unsigned>? bytes - 0
+        cmp     $0,%rdx
+        # comment:fp stack unchanged by jump
+        # goto done if !unsigned>
+        jbe     ._done
+        # comment:fp stack unchanged by fallthrough
+# start:
+._start:
+        # r11_stack = r11
+        movq    %r11,0(%rsp)
+        # r12_stack = r12
+        movq    %r12,8(%rsp)
+        # r13_stack = r13
+        movq    %r13,16(%rsp)
+        # r14_stack = r14
+        movq    %r14,24(%rsp)
+        # r15_stack = r15
+        movq    %r15,32(%rsp)
+        # rbx_stack = rbx
+        movq    %rbx,40(%rsp)
+        # rbp_stack = rbp
+        movq    %rbp,48(%rsp)
+        # in0 = *(uint64 *) (x + 0)
+        movq    0(%r8),%rcx
+        # in2 = *(uint64 *) (x + 8)
+        movq    8(%r8),%r9
+        # in4 = *(uint64 *) (x + 16)
+        movq    16(%r8),%rax
+        # in6 = *(uint64 *) (x + 24)
+        movq    24(%r8),%r10
+        # in8 = *(uint64 *) (x + 32)
+        movq    32(%r8),%r11
+        # in10 = *(uint64 *) (x + 40)
+        movq    40(%r8),%r12
+        # in12 = *(uint64 *) (x + 48)
+        movq    48(%r8),%r13
+        # in14 = *(uint64 *) (x + 56)
+        movq    56(%r8),%r14
+        # j0 = in0
+        movq    %rcx,56(%rsp)
+        # j2 = in2
+        movq    %r9,64(%rsp)
+        # j4 = in4
+        movq    %rax,72(%rsp)
+        # j6 = in6
+        movq    %r10,80(%rsp)
+        # j8 = in8
+        movq    %r11,88(%rsp)
+        # j10 = in10
+        movq    %r12,96(%rsp)
+        # j12 = in12
+        movq    %r13,104(%rsp)
+        # j14 = in14
+        movq    %r14,112(%rsp)
+        # x_backup = x
+        movq    %r8,120(%rsp)
+# bytesatleast1:
+._bytesatleast1:
+        #                   unsigned<? bytes - 64
+        cmp     $64,%rdx
+        # comment:fp stack unchanged by jump
+        #   goto nocopy if !unsigned<
+        jae     ._nocopy
+        #     ctarget = out
+        movq    %rdi,128(%rsp)
+        #     out = &tmp
+        leaq    192(%rsp),%rdi
+        #     i = bytes
+        mov     %rdx,%rcx
+        #     while (i) { *out++ = *m++; --i }
+        rep     movsb
+        #     out = &tmp
+        leaq    192(%rsp),%rdi
+        #     m = &tmp
+        leaq    192(%rsp),%rsi
+        # comment:fp stack unchanged by fallthrough
+#   nocopy:
+._nocopy:
+        #   out_backup = out
+        movq    %rdi,136(%rsp)
+        #   m_backup = m
+        movq    %rsi,144(%rsp)
+        #   bytes_backup = bytes
+        movq    %rdx,152(%rsp)
+        #   x1 = j0
+        movq    56(%rsp),%rdi
+        #   x0 = x1
+        mov     %rdi,%rdx
+        #   (uint64) x1 >>= 32
+        shr     $32,%rdi
+        #               x3 = j2
+        movq    64(%rsp),%rsi
+        #               x2 = x3
+        mov     %rsi,%rcx
+        #               (uint64) x3 >>= 32
+        shr     $32,%rsi
+        #   x5 = j4
+        movq    72(%rsp),%r8
+        #   x4 = x5
+        mov     %r8,%r9
+        #   (uint64) x5 >>= 32
+        shr     $32,%r8
+        #   x5_stack = x5
+        movq    %r8,160(%rsp)
+        #               x7 = j6
+        movq    80(%rsp),%r8
+        #               x6 = x7
+        mov     %r8,%rax
+        #               (uint64) x7 >>= 32
+        shr     $32,%r8
+        #   x9 = j8
+        movq    88(%rsp),%r10
+        #   x8 = x9
+        mov     %r10,%r11
+        #   (uint64) x9 >>= 32
+        shr     $32,%r10
+        #               x11 = j10
+        movq    96(%rsp),%r12
+        #               x10 = x11
+        mov     %r12,%r13
+        #               x10_stack = x10
+        movq    %r13,168(%rsp)
+        #               (uint64) x11 >>= 32
+        shr     $32,%r12
+        #   x13 = j12
+        movq    104(%rsp),%r13
+        #   x12 = x13
+        mov     %r13,%r14
+        #   (uint64) x13 >>= 32
+        shr     $32,%r13
+        #               x15 = j14
+        movq    112(%rsp),%r15
+        #               x14 = x15
+        mov     %r15,%rbx
+        #               (uint64) x15 >>= 32
+        shr     $32,%r15
+        #               x15_stack = x15
+        movq    %r15,176(%rsp)
+        #   i = 20
+        mov     $20,%r15
+#   mainloop:
+._mainloop:
+        #   i_backup = i
+        movq    %r15,184(%rsp)
+        #               x5 = x5_stack
+        movq    160(%rsp),%r15
+        # a = x12 + x0
+        lea     (%r14,%rdx),%rbp
+        # (uint32) a <<<= 7
+        rol     $7,%ebp
+        # x4 ^= a
+        xor     %rbp,%r9
+        #               b = x1 + x5
+        lea     (%rdi,%r15),%rbp
+        #               (uint32) b <<<= 7
+        rol     $7,%ebp
+        #               x9 ^= b
+        xor     %rbp,%r10
+        # a = x0 + x4
+        lea     (%rdx,%r9),%rbp
+        # (uint32) a <<<= 9
+        rol     $9,%ebp
+        # x8 ^= a
+        xor     %rbp,%r11
+        #               b = x5 + x9
+        lea     (%r15,%r10),%rbp
+        #               (uint32) b <<<= 9
+        rol     $9,%ebp
+        #               x13 ^= b
+        xor     %rbp,%r13
+        # a = x4 + x8
+        lea     (%r9,%r11),%rbp
+        # (uint32) a <<<= 13
+        rol     $13,%ebp
+        # x12 ^= a
+        xor     %rbp,%r14
+        #               b = x9 + x13
+        lea     (%r10,%r13),%rbp
+        #               (uint32) b <<<= 13
+        rol     $13,%ebp
+        #               x1 ^= b
+        xor     %rbp,%rdi
+        # a = x8 + x12
+        lea     (%r11,%r14),%rbp
+        # (uint32) a <<<= 18
+        rol     $18,%ebp
+        # x0 ^= a
+        xor     %rbp,%rdx
+        #               b = x13 + x1
+        lea     (%r13,%rdi),%rbp
+        #               (uint32) b <<<= 18
+        rol     $18,%ebp
+        #               x5 ^= b
+        xor     %rbp,%r15
+        #                               x10 = x10_stack
+        movq    168(%rsp),%rbp
+        #               x5_stack = x5
+        movq    %r15,160(%rsp)
+        #                               c = x6 + x10
+        lea     (%rax,%rbp),%r15
+        #                               (uint32) c <<<= 7
+        rol     $7,%r15d
+        #                               x14 ^= c
+        xor     %r15,%rbx
+        #                               c = x10 + x14
+        lea     (%rbp,%rbx),%r15
+        #                               (uint32) c <<<= 9
+        rol     $9,%r15d
+        #                               x2 ^= c
+        xor     %r15,%rcx
+        #                               c = x14 + x2
+        lea     (%rbx,%rcx),%r15
+        #                               (uint32) c <<<= 13
+        rol     $13,%r15d
+        #                               x6 ^= c
+        xor     %r15,%rax
+        #                               c = x2 + x6
+        lea     (%rcx,%rax),%r15
+        #                               (uint32) c <<<= 18
+        rol     $18,%r15d
+        #                               x10 ^= c
+        xor     %r15,%rbp
+        #                                               x15 = x15_stack
+        movq    176(%rsp),%r15
+        #                               x10_stack = x10
+        movq    %rbp,168(%rsp)
+        #                                               d = x11 + x15
+        lea     (%r12,%r15),%rbp
+        #                                               (uint32) d <<<= 7
+        rol     $7,%ebp
+        #                                               x3 ^= d
+        xor     %rbp,%rsi
+        #                                               d = x15 + x3
+        lea     (%r15,%rsi),%rbp
+        #                                               (uint32) d <<<= 9
+        rol     $9,%ebp
+        #                                               x7 ^= d
+        xor     %rbp,%r8
+        #                                               d = x3 + x7
+        lea     (%rsi,%r8),%rbp
+        #                                               (uint32) d <<<= 13
+        rol     $13,%ebp
+        #                                               x11 ^= d
+        xor     %rbp,%r12
+        #                                               d = x7 + x11
+        lea     (%r8,%r12),%rbp
+        #                                               (uint32) d <<<= 18
+        rol     $18,%ebp
+        #                                               x15 ^= d
+        xor     %rbp,%r15
+        #                                               x15_stack = x15
+        movq    %r15,176(%rsp)
+        #               x5 = x5_stack
+        movq    160(%rsp),%r15
+        # a = x3 + x0
+        lea     (%rsi,%rdx),%rbp
+        # (uint32) a <<<= 7
+        rol     $7,%ebp
+        # x1 ^= a
+        xor     %rbp,%rdi
+        #               b = x4 + x5
+        lea     (%r9,%r15),%rbp
+        #               (uint32) b <<<= 7
+        rol     $7,%ebp
+        #               x6 ^= b
+        xor     %rbp,%rax
+        # a = x0 + x1
+        lea     (%rdx,%rdi),%rbp
+        # (uint32) a <<<= 9
+        rol     $9,%ebp
+        # x2 ^= a
+        xor     %rbp,%rcx
+        #               b = x5 + x6
+        lea     (%r15,%rax),%rbp
+        #               (uint32) b <<<= 9
+        rol     $9,%ebp
+        #               x7 ^= b
+        xor     %rbp,%r8
+        # a = x1 + x2
+        lea     (%rdi,%rcx),%rbp
+        # (uint32) a <<<= 13
+        rol     $13,%ebp
+        # x3 ^= a
+        xor     %rbp,%rsi
+        #               b = x6 + x7
+        lea     (%rax,%r8),%rbp
+        #               (uint32) b <<<= 13
+        rol     $13,%ebp
+        #               x4 ^= b
+        xor     %rbp,%r9
+        # a = x2 + x3
+        lea     (%rcx,%rsi),%rbp
+        # (uint32) a <<<= 18
+        rol     $18,%ebp
+        # x0 ^= a
+        xor     %rbp,%rdx
+        #               b = x7 + x4
+        lea     (%r8,%r9),%rbp
+        #               (uint32) b <<<= 18
+        rol     $18,%ebp
+        #               x5 ^= b
+        xor     %rbp,%r15
+        #                               x10 = x10_stack
+        movq    168(%rsp),%rbp
+        #               x5_stack = x5
+        movq    %r15,160(%rsp)
+        #                               c = x9 + x10
+        lea     (%r10,%rbp),%r15
+        #                               (uint32) c <<<= 7
+        rol     $7,%r15d
+        #                               x11 ^= c
+        xor     %r15,%r12
+        #                               c = x10 + x11
+        lea     (%rbp,%r12),%r15
+        #                               (uint32) c <<<= 9
+        rol     $9,%r15d
+        #                               x8 ^= c
+        xor     %r15,%r11
+        #                               c = x11 + x8
+        lea     (%r12,%r11),%r15
+        #                               (uint32) c <<<= 13
+        rol     $13,%r15d
+        #                               x9 ^= c
+        xor     %r15,%r10
+        #                               c = x8 + x9
+        lea     (%r11,%r10),%r15
+        #                               (uint32) c <<<= 18
+        rol     $18,%r15d
+        #                               x10 ^= c
+        xor     %r15,%rbp
+        #                                               x15 = x15_stack
+        movq    176(%rsp),%r15
+        #                               x10_stack = x10
+        movq    %rbp,168(%rsp)
+        #                                               d = x14 + x15
+        lea     (%rbx,%r15),%rbp
+        #                                               (uint32) d <<<= 7
+        rol     $7,%ebp
+        #                                               x12 ^= d
+        xor     %rbp,%r14
+        #                                               d = x15 + x12
+        lea     (%r15,%r14),%rbp
+        #                                               (uint32) d <<<= 9
+        rol     $9,%ebp
+        #                                               x13 ^= d
+        xor     %rbp,%r13
+        #                                               d = x12 + x13
+        lea     (%r14,%r13),%rbp
+        #                                               (uint32) d <<<= 13
+        rol     $13,%ebp
+        #                                               x14 ^= d
+        xor     %rbp,%rbx
+        #                                               d = x13 + x14
+        lea     (%r13,%rbx),%rbp
+        #                                               (uint32) d <<<= 18
+        rol     $18,%ebp
+        #                                               x15 ^= d
+        xor     %rbp,%r15
+        #                                               x15_stack = x15
+        movq    %r15,176(%rsp)
+        #               x5 = x5_stack
+        movq    160(%rsp),%r15
+        # a = x12 + x0
+        lea     (%r14,%rdx),%rbp
+        # (uint32) a <<<= 7
+        rol     $7,%ebp
+        # x4 ^= a
+        xor     %rbp,%r9
+        #               b = x1 + x5
+        lea     (%rdi,%r15),%rbp
+        #               (uint32) b <<<= 7
+        rol     $7,%ebp
+        #               x9 ^= b
+        xor     %rbp,%r10
+        # a = x0 + x4
+        lea     (%rdx,%r9),%rbp
+        # (uint32) a <<<= 9
+        rol     $9,%ebp
+        # x8 ^= a
+        xor     %rbp,%r11
+        #               b = x5 + x9
+        lea     (%r15,%r10),%rbp
+        #               (uint32) b <<<= 9
+        rol     $9,%ebp
+        #               x13 ^= b
+        xor     %rbp,%r13
+        # a = x4 + x8
+        lea     (%r9,%r11),%rbp
+        # (uint32) a <<<= 13
+        rol     $13,%ebp
+        # x12 ^= a
+        xor     %rbp,%r14
+        #               b = x9 + x13
+        lea     (%r10,%r13),%rbp
+        #               (uint32) b <<<= 13
+        rol     $13,%ebp
+        #               x1 ^= b
+        xor     %rbp,%rdi
+        # a = x8 + x12
+        lea     (%r11,%r14),%rbp
+        # (uint32) a <<<= 18
+        rol     $18,%ebp
+        # x0 ^= a
+        xor     %rbp,%rdx
+        #               b = x13 + x1
+        lea     (%r13,%rdi),%rbp
+        #               (uint32) b <<<= 18
+        rol     $18,%ebp
+        #               x5 ^= b
+        xor     %rbp,%r15
+        #                               x10 = x10_stack
+        movq    168(%rsp),%rbp
+        #               x5_stack = x5
+        movq    %r15,160(%rsp)
+        #                               c = x6 + x10
+        lea     (%rax,%rbp),%r15
+        #                               (uint32) c <<<= 7
+        rol     $7,%r15d
+        #                               x14 ^= c
+        xor     %r15,%rbx
+        #                               c = x10 + x14
+        lea     (%rbp,%rbx),%r15
+        #                               (uint32) c <<<= 9
+        rol     $9,%r15d
+        #                               x2 ^= c
+        xor     %r15,%rcx
+        #                               c = x14 + x2
+        lea     (%rbx,%rcx),%r15
+        #                               (uint32) c <<<= 13
+        rol     $13,%r15d
+        #                               x6 ^= c
+        xor     %r15,%rax
+        #                               c = x2 + x6
+        lea     (%rcx,%rax),%r15
+        #                               (uint32) c <<<= 18
+        rol     $18,%r15d
+        #                               x10 ^= c
+        xor     %r15,%rbp
+        #                                               x15 = x15_stack
+        movq    176(%rsp),%r15
+        #                               x10_stack = x10
+        movq    %rbp,168(%rsp)
+        #                                               d = x11 + x15
+        lea     (%r12,%r15),%rbp
+        #                                               (uint32) d <<<= 7
+        rol     $7,%ebp
+        #                                               x3 ^= d
+        xor     %rbp,%rsi
+        #                                               d = x15 + x3
+        lea     (%r15,%rsi),%rbp
+        #                                               (uint32) d <<<= 9
+        rol     $9,%ebp
+        #                                               x7 ^= d
+        xor     %rbp,%r8
+        #                                               d = x3 + x7
+        lea     (%rsi,%r8),%rbp
+        #                                               (uint32) d <<<= 13
+        rol     $13,%ebp
+        #                                               x11 ^= d
+        xor     %rbp,%r12
+        #                                               d = x7 + x11
+        lea     (%r8,%r12),%rbp
+        #                                               (uint32) d <<<= 18
+        rol     $18,%ebp
+        #                                               x15 ^= d
+        xor     %rbp,%r15
+        #                                               x15_stack = x15
+        movq    %r15,176(%rsp)
+        #               x5 = x5_stack
+        movq    160(%rsp),%r15
+        # a = x3 + x0
+        lea     (%rsi,%rdx),%rbp
+        # (uint32) a <<<= 7
+        rol     $7,%ebp
+        # x1 ^= a
+        xor     %rbp,%rdi
+        #               b = x4 + x5
+        lea     (%r9,%r15),%rbp
+        #               (uint32) b <<<= 7
+        rol     $7,%ebp
+        #               x6 ^= b
+        xor     %rbp,%rax
+        # a = x0 + x1
+        lea     (%rdx,%rdi),%rbp
+        # (uint32) a <<<= 9
+        rol     $9,%ebp
+        # x2 ^= a
+        xor     %rbp,%rcx
+        #               b = x5 + x6
+        lea     (%r15,%rax),%rbp
+        #               (uint32) b <<<= 9
+        rol     $9,%ebp
+        #               x7 ^= b
+        xor     %rbp,%r8
+        # a = x1 + x2
+        lea     (%rdi,%rcx),%rbp
+        # (uint32) a <<<= 13
+        rol     $13,%ebp
+        # x3 ^= a
+        xor     %rbp,%rsi
+        #               b = x6 + x7
+        lea     (%rax,%r8),%rbp
+        #               (uint32) b <<<= 13
+        rol     $13,%ebp
+        #               x4 ^= b
+        xor     %rbp,%r9
+        # a = x2 + x3
+        lea     (%rcx,%rsi),%rbp
+        # (uint32) a <<<= 18
+        rol     $18,%ebp
+        # x0 ^= a
+        xor     %rbp,%rdx
+        #               b = x7 + x4
+        lea     (%r8,%r9),%rbp
+        #               (uint32) b <<<= 18
+        rol     $18,%ebp
+        #               x5 ^= b
+        xor     %rbp,%r15
+        #                               x10 = x10_stack
+        movq    168(%rsp),%rbp
+        #               x5_stack = x5
+        movq    %r15,160(%rsp)
+        #                               c = x9 + x10
+        lea     (%r10,%rbp),%r15
+        #                               (uint32) c <<<= 7
+        rol     $7,%r15d
+        #                               x11 ^= c
+        xor     %r15,%r12
+        #                               c = x10 + x11
+        lea     (%rbp,%r12),%r15
+        #                               (uint32) c <<<= 9
+        rol     $9,%r15d
+        #                               x8 ^= c
+        xor     %r15,%r11
+        #                               c = x11 + x8
+        lea     (%r12,%r11),%r15
+        #                               (uint32) c <<<= 13
+        rol     $13,%r15d
+        #                               x9 ^= c
+        xor     %r15,%r10
+        #                               c = x8 + x9
+        lea     (%r11,%r10),%r15
+        #                               (uint32) c <<<= 18
+        rol     $18,%r15d
+        #                               x10 ^= c
+        xor     %r15,%rbp
+        #                                               x15 = x15_stack
+        movq    176(%rsp),%r15
+        #                               x10_stack = x10
+        movq    %rbp,168(%rsp)
+        #                                               d = x14 + x15
+        lea     (%rbx,%r15),%rbp
+        #                                               (uint32) d <<<= 7
+        rol     $7,%ebp
+        #                                               x12 ^= d
+        xor     %rbp,%r14
+        #                                               d = x15 + x12
+        lea     (%r15,%r14),%rbp
+        #                                               (uint32) d <<<= 9
+        rol     $9,%ebp
+        #                                               x13 ^= d
+        xor     %rbp,%r13
+        #                                               d = x12 + x13
+        lea     (%r14,%r13),%rbp
+        #                                               (uint32) d <<<= 13
+        rol     $13,%ebp
+        #                                               x14 ^= d
+        xor     %rbp,%rbx
+        #                                               d = x13 + x14
+        lea     (%r13,%rbx),%rbp
+        #                                               (uint32) d <<<= 18
+        rol     $18,%ebp
+        #                                               x15 ^= d
+        xor     %rbp,%r15
+        #                                               x15_stack = x15
+        movq    %r15,176(%rsp)
+        #   i = i_backup
+        movq    184(%rsp),%r15
+        #                  unsigned>? i -= 4
+        sub     $4,%r15
+        # comment:fp stack unchanged by jump
+        # goto mainloop if unsigned>
+        ja      ._mainloop
+        #   (uint32) x2 += j2
+        addl    64(%rsp),%ecx
+        #   x3 <<= 32
+        shl     $32,%rsi
+        #   x3 += j2
+        addq    64(%rsp),%rsi
+        #   (uint64) x3 >>= 32
+        shr     $32,%rsi
+        #   x3 <<= 32
+        shl     $32,%rsi
+        #   x2 += x3
+        add     %rsi,%rcx
+        #   (uint32) x6 += j6
+        addl    80(%rsp),%eax
+        #   x7 <<= 32
+        shl     $32,%r8
+        #   x7 += j6
+        addq    80(%rsp),%r8
+        #   (uint64) x7 >>= 32
+        shr     $32,%r8
+        #   x7 <<= 32
+        shl     $32,%r8
+        #   x6 += x7
+        add     %r8,%rax
+        #   (uint32) x8 += j8
+        addl    88(%rsp),%r11d
+        #   x9 <<= 32
+        shl     $32,%r10
+        #   x9 += j8
+        addq    88(%rsp),%r10
+        #   (uint64) x9 >>= 32
+        shr     $32,%r10
+        #   x9 <<= 32
+        shl     $32,%r10
+        #   x8 += x9
+        add     %r10,%r11
+        #   (uint32) x12 += j12
+        addl    104(%rsp),%r14d
+        #   x13 <<= 32
+        shl     $32,%r13
+        #   x13 += j12
+        addq    104(%rsp),%r13
+        #   (uint64) x13 >>= 32
+        shr     $32,%r13
+        #   x13 <<= 32
+        shl     $32,%r13
+        #   x12 += x13
+        add     %r13,%r14
+        #   (uint32) x0 += j0
+        addl    56(%rsp),%edx
+        #   x1 <<= 32
+        shl     $32,%rdi
+        #   x1 += j0
+        addq    56(%rsp),%rdi
+        #   (uint64) x1 >>= 32
+        shr     $32,%rdi
+        #   x1 <<= 32
+        shl     $32,%rdi
+        #   x0 += x1
+        add     %rdi,%rdx
+        #   x5 = x5_stack
+        movq    160(%rsp),%rdi
+        #   (uint32) x4 += j4
+        addl    72(%rsp),%r9d
+        #   x5 <<= 32
+        shl     $32,%rdi
+        #   x5 += j4
+        addq    72(%rsp),%rdi
+        #   (uint64) x5 >>= 32
+        shr     $32,%rdi
+        #   x5 <<= 32
+        shl     $32,%rdi
+        #   x4 += x5
+        add     %rdi,%r9
+        #   x10 = x10_stack
+        movq    168(%rsp),%r8
+        #   (uint32) x10 += j10
+        addl    96(%rsp),%r8d
+        #   x11 <<= 32
+        shl     $32,%r12
+        #   x11 += j10
+        addq    96(%rsp),%r12
+        #   (uint64) x11 >>= 32
+        shr     $32,%r12
+        #   x11 <<= 32
+        shl     $32,%r12
+        #   x10 += x11
+        add     %r12,%r8
+        #   x15 = x15_stack
+        movq    176(%rsp),%rdi
+        #   (uint32) x14 += j14
+        addl    112(%rsp),%ebx
+        #   x15 <<= 32
+        shl     $32,%rdi
+        #   x15 += j14
+        addq    112(%rsp),%rdi
+        #   (uint64) x15 >>= 32
+        shr     $32,%rdi
+        #   x15 <<= 32
+        shl     $32,%rdi
+        #   x14 += x15
+        add     %rdi,%rbx
+        #   out = out_backup
+        movq    136(%rsp),%rdi
+        #   m = m_backup
+        movq    144(%rsp),%rsi
+        #   x0 ^= *(uint64 *) (m + 0)
+        xorq    0(%rsi),%rdx
+        #   *(uint64 *) (out + 0) = x0
+        movq    %rdx,0(%rdi)
+        #   x2 ^= *(uint64 *) (m + 8)
+        xorq    8(%rsi),%rcx
+        #   *(uint64 *) (out + 8) = x2
+        movq    %rcx,8(%rdi)
+        #   x4 ^= *(uint64 *) (m + 16)
+        xorq    16(%rsi),%r9
+        #   *(uint64 *) (out + 16) = x4
+        movq    %r9,16(%rdi)
+        #   x6 ^= *(uint64 *) (m + 24)
+        xorq    24(%rsi),%rax
+        #   *(uint64 *) (out + 24) = x6
+        movq    %rax,24(%rdi)
+        #   x8 ^= *(uint64 *) (m + 32)
+        xorq    32(%rsi),%r11
+        #   *(uint64 *) (out + 32) = x8
+        movq    %r11,32(%rdi)
+        #   x10 ^= *(uint64 *) (m + 40)
+        xorq    40(%rsi),%r8
+        #   *(uint64 *) (out + 40) = x10
+        movq    %r8,40(%rdi)
+        #   x12 ^= *(uint64 *) (m + 48)
+        xorq    48(%rsi),%r14
+        #   *(uint64 *) (out + 48) = x12
+        movq    %r14,48(%rdi)
+        #   x14 ^= *(uint64 *) (m + 56)
+        xorq    56(%rsi),%rbx
+        #   *(uint64 *) (out + 56) = x14
+        movq    %rbx,56(%rdi)
+        #   bytes = bytes_backup
+        movq    152(%rsp),%rdx
+        #   in8 = j8
+        movq    88(%rsp),%rcx
+        #   in8 += 1
+        add     $1,%rcx
+        #   j8 = in8
+        movq    %rcx,88(%rsp)
+        #                          unsigned>? unsigned<? bytes - 64
+        cmp     $64,%rdx
+        # comment:fp stack unchanged by jump
+        #   goto bytesatleast65 if unsigned>
+        ja      ._bytesatleast65
+        # comment:fp stack unchanged by jump
+        #     goto bytesatleast64 if !unsigned<
+        jae     ._bytesatleast64
+        #       m = out
+        mov     %rdi,%rsi
+        #       out = ctarget
+        movq    128(%rsp),%rdi
+        #       i = bytes
+        mov     %rdx,%rcx
+        #       while (i) { *out++ = *m++; --i }
+        rep     movsb
+        # comment:fp stack unchanged by fallthrough
+#     bytesatleast64:
+._bytesatleast64:
+        #     x = x_backup
+        movq    120(%rsp),%rdi
+        #     in8 = j8
+        movq    88(%rsp),%rsi
+        #     *(uint64 *) (x + 32) = in8
+        movq    %rsi,32(%rdi)
+        #     r11 = r11_stack
+        movq    0(%rsp),%r11
+        #     r12 = r12_stack
+        movq    8(%rsp),%r12
+        #     r13 = r13_stack
+        movq    16(%rsp),%r13
+        #     r14 = r14_stack
+        movq    24(%rsp),%r14
+        #     r15 = r15_stack
+        movq    32(%rsp),%r15
+        #     rbx = rbx_stack
+        movq    40(%rsp),%rbx
+        #     rbp = rbp_stack
+        movq    48(%rsp),%rbp
+        # comment:fp stack unchanged by fallthrough
+#     done:
+._done:
+        #     leave
+        add     %r11,%rsp
+        mov     %rdi,%rax
+        mov     %rsi,%rdx
+        ret
+#   bytesatleast65:
+._bytesatleast65:
+        #   bytes -= 64
+        sub     $64,%rdx
+        #   out += 64
+        add     $64,%rdi
+        #   m += 64
+        add     $64,%rsi
+        # comment:fp stack unchanged by jump
+        # goto bytesatleast1
+        jmp     ._bytesatleast1
+# enter ECRYPT_keysetup
+#
+# ECRYPT_keysetup(x = %rdi, k = %rsi, kbits = %rdx)
+#
+# Fills the key and constant words of the 64-byte state at x:
+#   x+0, x+20, x+40, x+60   32-bit constants; read as little-endian bytes
+#                           they spell "expand 32-byte k" on the 256-bit
+#                           path and "expand 16-byte k" on the 128-bit path
+#   x+4  .. x+19            key bytes 0..15
+#   x+44 .. x+59            key bytes 16..31 (256-bit path) or key bytes
+#                           0..15 again (128-bit path)
+# Bytes x+24 .. x+39 are not written by this routine.
+# Any kbits value below 256 takes the 128-bit path; kbits is not validated
+# any further here.
+.text
+.p2align 5
+.globl ECRYPT_keysetup
+ECRYPT_keysetup:
+        # Reserve a scratch frame of 256 + (rsp & 31) bytes, which leaves %rsp
+        # 32-byte aligned; %r11 keeps the frame size so the exit path can undo
+        # it.  Nothing in this routine reads or writes the frame.
+        mov     %rsp,%r11
+        and     $31,%r11
+        add     $256,%r11
+        sub     %r11,%rsp
+        # The three self-moves below leave their registers unchanged.
+        #   k = arg2
+        mov     %rsi,%rsi
+        #   kbits = arg3
+        mov     %rdx,%rdx
+        #   x = arg1
+        mov     %rdi,%rdi
+        # Key bytes 0..15 go to x+4 .. x+19 on both paths.
+        #   in0 = *(uint64 *) (k + 0)
+        movq    0(%rsi),%r8
+        #   in2 = *(uint64 *) (k + 8)
+        movq    8(%rsi),%r9
+        #   *(uint64 *) (x + 4) = in0
+        movq    %r8,4(%rdi)
+        #   *(uint64 *) (x + 12) = in2
+        movq    %r9,12(%rdi)
+        #                    unsigned<? kbits - 256
+        cmp     $256,%rdx
+        # comment:fp stack unchanged by jump
+        #   goto kbits128 if unsigned<
+        jb      ._kbits128
+#   kbits256:
+._kbits256:
+        # Second key half: key bytes 16..31 go to x+44 .. x+59.
+        # %rdx (kbits) and %rsi (k) are overwritten from here on.
+        #     in10 = *(uint64 *) (k + 16)
+        movq    16(%rsi),%rdx
+        #     in12 = *(uint64 *) (k + 24)
+        movq    24(%rsi),%rsi
+        #     *(uint64 *) (x + 44) = in10
+        movq    %rdx,44(%rdi)
+        #     *(uint64 *) (x + 52) = in12
+        movq    %rsi,52(%rdi)
+        # Constants 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574.
+        #     in0 = 1634760805
+        mov     $1634760805,%rsi
+        #     in4 = 857760878
+        mov     $857760878,%rdx
+        #     in10 = 2036477234
+        mov     $2036477234,%rcx
+        #     in14 = 1797285236
+        mov     $1797285236,%r8
+        #     *(uint32 *) (x + 0) = in0
+        movl    %esi,0(%rdi)
+        #     *(uint32 *) (x + 20) = in4
+        movl    %edx,20(%rdi)
+        #     *(uint32 *) (x + 40) = in10
+        movl    %ecx,40(%rdi)
+        #     *(uint32 *) (x + 60) = in14
+        movl    %r8d,60(%rdi)
+        # comment:fp stack unchanged by jump
+        #   goto keysetupdone
+        jmp     ._keysetupdone
+#   kbits128:
+._kbits128:
+        # Second key half: key bytes 0..15 are stored again at x+44 .. x+59.
+        # %rdx (kbits) and %rsi (k) are overwritten from here on.
+        #     in10 = *(uint64 *) (k + 0)
+        movq    0(%rsi),%rdx
+        #     in12 = *(uint64 *) (k + 8)
+        movq    8(%rsi),%rsi
+        #     *(uint64 *) (x + 44) = in10
+        movq    %rdx,44(%rdi)
+        #     *(uint64 *) (x + 52) = in12
+        movq    %rsi,52(%rdi)
+        # Constants 0x61707865, 0x3120646e, 0x79622d36, 0x6b206574.
+        #     in0 = 1634760805
+        mov     $1634760805,%rsi
+        #     in4 = 824206446
+        mov     $824206446,%rdx
+        #     in10 = 2036477238
+        mov     $2036477238,%rcx
+        #     in14 = 1797285236
+        mov     $1797285236,%r8
+        #     *(uint32 *) (x + 0) = in0
+        movl    %esi,0(%rdi)
+        #     *(uint32 *) (x + 20) = in4
+        movl    %edx,20(%rdi)
+        #     *(uint32 *) (x + 40) = in10
+        movl    %ecx,40(%rdi)
+        #     *(uint32 *) (x + 60) = in14
+        movl    %r8d,60(%rdi)
+#   keysetupdone:
+._keysetupdone:
+        # leave
+        # Release the scratch frame, then copy %rdi/%rsi into %rax/%rdx.
+        # On both paths %rsi holds the first constant (1634760805) here.
+        add     %r11,%rsp
+        mov     %rdi,%rax
+        mov     %rsi,%rdx
+        ret
+# enter ECRYPT_ivsetup
+#
+# ECRYPT_ivsetup(x = %rdi, iv = %rsi)
+#
+# Copies the 8 bytes at iv to x+24 and stores a 64-bit zero at x+32.
+# No other byte of the state at x is touched.
+.text
+.p2align 5
+.globl ECRYPT_ivsetup
+ECRYPT_ivsetup:
+        # Reserve a scratch frame of 256 + (rsp & 31) bytes, which leaves %rsp
+        # 32-byte aligned; %r11 keeps the frame size so the exit path can undo
+        # it.  Nothing in this routine reads or writes the frame.
+        mov     %rsp,%r11
+        and     $31,%r11
+        add     $256,%r11
+        sub     %r11,%rsp
+        # The two self-moves below leave their registers unchanged.
+        #   iv = arg2
+        mov     %rsi,%rsi
+        #   x = arg1
+        mov     %rdi,%rdi
+        # %rsi is reused: it stops being the iv pointer and holds the IV value.
+        #   in6 = *(uint64 *) (iv + 0)
+        movq    0(%rsi),%rsi
+        #   in8 = 0
+        mov     $0,%r8
+        #   *(uint64 *) (x + 24) = in6
+        movq    %rsi,24(%rdi)
+        #   *(uint64 *) (x + 32) = in8
+        movq    %r8,32(%rdi)
+        # leave
+        # Release the scratch frame, then copy %rdi/%rsi into %rax/%rdx
+        # (%rsi holds the 8 IV bytes at this point).
+        add     %r11,%rsp
+        mov     %rdi,%rax
+        mov     %rsi,%rdx
+        ret
diff --git a/arch/x86/crypto/salsa20_glue.c b/arch/x86/crypto/salsa20_glue.c
new file mode 100644
index 000000000000..bccb76d80987
--- /dev/null
+++ b/arch/x86/crypto/salsa20_glue.c
@@ -0,0 +1,129 @@
+/*
+ * Glue code for the optimized assembly versions of Salsa20.
+ *
+ * Copyright (c) 2007 Tan Swee Heng <thesweeheng@gmail.com>
+ *
+ * The assembly code is public-domain code written by Daniel J. Bernstein
+ * <djb@cr.yp.to>. It has been modified to add indentation and to remove
+ * extraneous comments and functions that are not needed.
+ * - i586 version, renamed as salsa20-i586-asm_32.S
+ *   available from <http://cr.yp.to/snuffle/salsa20/x86-pm/salsa20.s>
+ * - x86-64 version, renamed as salsa20-x86_64-asm_64.S
+ *   available from <http://cr.yp.to/snuffle/salsa20/amd64-3/salsa20.s>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <crypto/algapi.h>
+#include <linux/module.h>
+#include <linux/crypto.h>
+/* Sizes in bytes; setkey() multiplies by 8 before calling the assembly. */
+#define SALSA20_IV_SIZE        8U
+#define SALSA20_MIN_KEY_SIZE  16U
+#define SALSA20_MAX_KEY_SIZE  32U
+/*
+ * The assembly exports its entry points under the ECRYPT_* names; map the
+ * salsa20_* names used in this file onto those symbols.
+ */
+#define salsa20_keysetup        ECRYPT_keysetup
+#define salsa20_ivsetup         ECRYPT_ivsetup
+#define salsa20_encrypt_bytes   ECRYPT_encrypt_bytes
+/*
+ * Per-transform cipher state: sixteen 32-bit words (64 bytes).
+ *
+ * As written by the x86-64 assembly in this patch:
+ *   words 0, 5, 10, 15   constants stored by salsa20_keysetup()
+ *   words 1..4           first 16 key bytes
+ *   words 11..14         remaining key bytes (or the first 16 again)
+ *   words 6..7           the 8 IV bytes stored by salsa20_ivsetup()
+ *   words 8..9           64-bit value zeroed by salsa20_ivsetup()
+ * NOTE(review): the i586 assembly is not visible here; presumably it uses
+ * the same layout -- confirm.
+ */
+struct salsa20_ctx
+{
+        u32 input[16];
+};
+/*
+ * Entry points implemented in assembly (see the ECRYPT_* name mapping).
+ *
+ * salsa20_keysetup():      keysize and ivsize are given in bits.
+ *   NOTE(review): the x86-64 routine overwrites its fourth argument register
+ *   before reading it, so ivsize looks unused there -- confirm for i586.
+ *   NOTE(review): keysize is declared u32, but the x86-64 routine compares
+ *   the full 64-bit register against 256; this assumes the caller leaves the
+ *   upper half zero -- confirm.
+ * salsa20_ivsetup():       iv points at SALSA20_IV_SIZE bytes.
+ * salsa20_encrypt_bytes(): processes `bytes` bytes from src into dst.
+ */
+asmlinkage void salsa20_keysetup(struct salsa20_ctx *ctx, const u8 *k,
+                                 u32 keysize, u32 ivsize);
+asmlinkage void salsa20_ivsetup(struct salsa20_ctx *ctx, const u8 *iv);
+asmlinkage void salsa20_encrypt_bytes(struct salsa20_ctx *ctx,
+                                      const u8 *src, u8 *dst, u32 bytes);
+/*
+ * Load a new key into the transform's Salsa20 context.
+ *
+ * keysize is in bytes; the assembly key setup takes bit counts, so both the
+ * key length and the fixed IV length are converted before the call.
+ * Always returns 0.
+ */
+static int setkey(struct crypto_tfm *tfm, const u8 *key,
+                  unsigned int keysize)
+{
+        const u32 key_bits = keysize * 8;
+        const u32 iv_bits = SALSA20_IV_SIZE * 8;
+        salsa20_keysetup(crypto_tfm_ctx(tfm), key, key_bits, iv_bits);
+        return 0;
+}
+/*
+ * Encrypt nbytes from src into dst; also installed as the decrypt handler.
+ *
+ * The IV exposed by the walk (walk.iv) is loaded into the context, then the
+ * data is processed one walk step at a time.  Every step except the last is
+ * trimmed to a multiple of 64 bytes and the remainder is handed back to the
+ * walk, so only the final salsa20_encrypt_bytes() call may end on a partial
+ * 64-byte block.
+ *
+ * blkcipher_walk_done() is called only for steps that actually supplied
+ * data.  A zero-length request therefore returns the result of
+ * blkcipher_walk_virt_block() directly instead of finishing a walk that
+ * never mapped anything.
+ *
+ * Returns 0 on success or the error reported by the walk helpers.
+ */
+static int encrypt(struct blkcipher_desc *desc,
+                   struct scatterlist *dst, struct scatterlist *src,
+                   unsigned int nbytes)
+{
+        struct blkcipher_walk walk;
+        struct crypto_blkcipher *tfm = desc->tfm;
+        struct salsa20_ctx *ctx = crypto_blkcipher_ctx(tfm);
+        int err;
+        blkcipher_walk_init(&walk, dst, src, nbytes);
+        err = blkcipher_walk_virt_block(desc, &walk, 64);
+        salsa20_ivsetup(ctx, walk.iv);
+        /* Whole 64-byte blocks; the tail of each step goes back to the walk. */
+        while (walk.nbytes >= 64) {
+                salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
+                                      walk.dst.virt.addr,
+                                      walk.nbytes - (walk.nbytes % 64));
+                err = blkcipher_walk_done(desc, &walk, walk.nbytes % 64);
+        }
+        /* Final partial block, if any. */
+        if (walk.nbytes) {
+                salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
+                                      walk.dst.virt.addr, walk.nbytes);
+                err = blkcipher_walk_done(desc, &walk, 0);
+        }
+        return err;
+}
+/*
+ * Algorithm descriptor handed to the crypto API by init()/fini().
+ * The same handler serves .encrypt and .decrypt.
+ */
+static struct crypto_alg alg = {
+        .cra_name = "salsa20",
+        .cra_driver_name = "salsa20-asm",
+        .cra_priority = 200,
+        .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+        .cra_type = &crypto_blkcipher_type,
+        .cra_blocksize = 1,
+        .cra_ctxsize = sizeof(struct salsa20_ctx),
+        .cra_alignmask = 3,
+        .cra_module = THIS_MODULE,
+        .cra_list = LIST_HEAD_INIT(alg.cra_list),
+        .cra_u.blkcipher = {
+                .min_keysize = SALSA20_MIN_KEY_SIZE,
+                .max_keysize = SALSA20_MAX_KEY_SIZE,
+                .ivsize = SALSA20_IV_SIZE,
+                .setkey = setkey,
+                .encrypt = encrypt,
+                .decrypt = encrypt,
+        },
+};
+/* Module load hook: register the algorithm and report the outcome. */
+static int __init init(void)
+{
+        int ret = crypto_register_alg(&alg);
+        return ret;
+}
+/* Module unload hook: remove the algorithm registered by init(). */
+static void __exit fini(void)
+{
+        crypto_unregister_alg(&alg);
+}
+/* Hook init()/fini() up as the module's load and unload handlers. */
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Salsa20 stream cipher algorithm (optimized assembly version)");
+/* Aliases match the cra_name and cra_driver_name set in alg. */
+MODULE_ALIAS("salsa20");
+MODULE_ALIAS("salsa20-asm");
diff --git a/arch/x86/crypto/twofish_64.c b/arch/x86/crypto/twofish_64.c
deleted file mode 100644
index 182d91d5cfb9..000000000000
--- a/arch/x86/crypto/twofish_64.c
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Glue Code for optimized x86_64 assembler version of TWOFISH
- *
- * Originally Twofish for GPG
- * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
- * 256-bit key length added March 20, 1999
- * Some modifications to reduce the text size by Werner Koch, April, 1998
- * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
- * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
- *
- * The original author has disclaimed all copyright interest in this
- * code and thus put it in the public domain. The subsequent authors
- * have put this under the GNU General Public License.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
- * USA
- *
- * This code is a "clean room" implementation, written from the paper
- * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
- * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
- * through http://www.counterpane.com/twofish.html
- *
- * For background information on multiplication in finite fields, used for
- * the matrix operations in the key schedule, see the book _Contemporary
- * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
- * Third Edition.
- */
-#include <crypto/twofish.h>
-#include <linux/crypto.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
-static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-        twofish_enc_blk(tfm, dst, src);
-}
-static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
-{
-        twofish_dec_blk(tfm, dst, src);
-}
-static struct crypto_alg alg = {
-        .cra_name               =       "twofish",
-        .cra_driver_name        =       "twofish-x86_64",
-        .cra_priority           =       200,
-        .cra_flags              =       CRYPTO_ALG_TYPE_CIPHER,
-        .cra_blocksize          =       TF_BLOCK_SIZE,
-        .cra_ctxsize            =       sizeof(struct twofish_ctx),
-        .cra_alignmask          =       3,
-        .cra_module             =       THIS_MODULE,
-        .cra_list               =       LIST_HEAD_INIT(alg.cra_list),
-        .cra_u                  =       {
-                .cipher = {
-                        .cia_min_keysize        =       TF_MIN_KEY_SIZE,
-                        .cia_max_keysize        =       TF_MAX_KEY_SIZE,
-                        .cia_setkey             =       twofish_setkey,
-                        .cia_encrypt            =       twofish_encrypt,
-                        .cia_decrypt            =       twofish_decrypt
-                }
-        }
-};
-static int __init init(void)
-{
-        return crypto_register_alg(&alg);
-}
-static void __exit fini(void)
-{
-        crypto_unregister_alg(&alg);
-}
-module_init(init);
-module_exit(fini);
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
-MODULE_ALIAS("twofish");
diff --git a/arch/x86/crypto/twofish_32.c b/arch/x86/crypto/twofish_glue.c
index e3004dfe9c7a..cefaf8b9aa18 100644
--- a/arch/x86/crypto/twofish_32.c
+++ b/arch/x86/crypto/twofish_glue.c
@@ -1,5 +1,5 @@
 /*
- *  Glue Code for optimized 586 assembler version of TWOFISH
+ * Glue Code for assembler optimized version of TWOFISH
 *
 * Originally Twofish for GPG
 * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
@@ -44,7 +44,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
 asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
@@ -60,7 +59,7 @@ static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
 static struct crypto_alg alg = {
        .cra_name               =       "twofish",
-        .cra_driver_name        =       "twofish-i586",
+        .cra_driver_name        =       "twofish-asm",
        .cra_priority           =       200,
        .cra_flags              =       CRYPTO_ALG_TYPE_CIPHER,
        .cra_blocksize          =       TF_BLOCK_SIZE,
@@ -93,5 +92,6 @@ module_init(init);
 module_exit(fini);
 MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, asm optimized");
 MODULE_ALIAS("twofish");
+MODULE_ALIAS("twofish-asm");