Linux-2.6.12-rc2 (tag: v2.6.12-rc2)

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/i386/crypto/aes-i586-asm.S
1 files changed, 376 insertions, 0 deletions
diff --git a/arch/i386/crypto/aes-i586-asm.S b/arch/i386/crypto/aes-i586-asm.S
new file mode 100644
index 000000000000..7b73c67cb4e8
--- /dev/null
+++ b/arch/i386/crypto/aes-i586-asm.S
@@ -0,0 +1,376 @@
+// -------------------------------------------------------------------------
+// Copyright (c) 2001, Dr Brian Gladman <                 >, Worcester, UK.
+// All rights reserved.
+//
+// LICENSE TERMS
+//
+// The free distribution and use of this software in both source and binary 
+// form is allowed (with or without changes) provided that:
+//
+//   1. distributions of this source code include the above copyright 
+//      notice, this list of conditions and the following disclaimer//
+//
+//   2. distributions in binary form include the above copyright
+//      notice, this list of conditions and the following disclaimer
+//      in the documentation and/or other associated materials//
+//
+//   3. the copyright holder's name is not used to endorse products 
+//      built using this software without specific written permission.
+//
+//
+// ALTERNATIVELY, provided that this notice is retained in full, this product
+// may be distributed under the terms of the GNU General Public License (GPL),
+// in which case the provisions of the GPL apply INSTEAD OF those given above.
+//
+// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
+// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
+// DISCLAIMER
+//
+// This software is provided 'as is' with no explicit or implied warranties
+// in respect of its properties including, but not limited to, correctness 
+// and fitness for purpose.
+// -------------------------------------------------------------------------
+// Issue Date: 29/07/2002
+.file "aes-i586-asm.S"
+.text
+// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])//
+// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])//
+        
+#define tlen 1024   // byte length of each of the 4 'xor' arrays (256 32-bit words)
+// %esp-relative offsets to parameters with one register (%ebp) pushed onto stack
+#define in_blk    8  // input byte array address parameter
+#define out_blk  12  // output byte array address parameter
+#define ctx      16  // AES context structure
+// byte offsets in context structure
+#define ekey     0   // encryption key schedule base address
+#define nrnd   256   // number of rounds (code branches on 10 and 12; any other value takes the 14-round path)
+#define dkey   260   // decryption key schedule base address
+// register mapping for encrypt and decrypt subroutines; only r0-r3 (eax..edx) have the l()/h() byte names below
+#define r0  eax
+#define r1  ebx
+#define r2  ecx
+#define r3  edx
+#define r4  esi
+#define r5  edi
+#define eaxl  al
+#define eaxh  ah
+#define ebxl  bl
+#define ebxh  bh
+#define ecxl  cl
+#define ecxh  ch
+#define edxl  dl
+#define edxh  dh
+#define _h(reg) reg##h
+#define h(reg) _h(reg)
+#define _l(reg) reg##l
+#define l(reg) _l(reg)
+// This macro takes a 32-bit word representing a column and uses
+// each of its four bytes to index into four tables of 256 32-bit
+// words to obtain values that are then xored into the four output
+// registers a1..a4 (lowest byte -> a1, then a2, a3, highest byte -> a4).
+// Parameters:
+// table base address of the first table; the others follow at +tlen, +2*tlen, +3*tlen
+//   a1  out_state[0]
+//   a2  out_state[1]
+//   a3  out_state[2]
+//   a4  out_state[3]
+//   idx input register for the round (destroyed); needs l()/h() byte names, i.e. one of r0-r3
+//   tmp scratch register for the round
+// (do_col takes no key schedule argument; sched is a parameter of do_fcol/do_icol only)
+#define do_col(table, a1,a2,a3,a4, idx, tmp)    \
+        movzx   %l(idx),%tmp;                   \
+        xor     table(,%tmp,4),%a1;             \
+        movzx   %h(idx),%tmp;                   \
+        shr     $16,%idx;                       \
+        xor     table+tlen(,%tmp,4),%a2;        \
+        movzx   %l(idx),%tmp;                   \
+        movzx   %h(idx),%idx;                   \
+        xor     table+2*tlen(,%tmp,4),%a3;      \
+        xor     table+3*tlen(,%idx,4),%a4;
+// do_fcol: do_col variant that first loads a1,a2,a4,a3 from key schedule offsets 0,12,4,8
+// NB1: original value of a3 is in idx on exit (copied there before a3 is reloaded)
+// NB2: original values of a1,a2,a4 aren't used (they are overwritten from sched)
+#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
+        mov     0 sched,%a1;                    \
+        movzx   %l(idx),%tmp;                   \
+        mov     12 sched,%a2;                   \
+        xor     table(,%tmp,4),%a1;             \
+        mov     4 sched,%a4;                    \
+        movzx   %h(idx),%tmp;                   \
+        shr     $16,%idx;                       \
+        xor     table+tlen(,%tmp,4),%a2;        \
+        movzx   %l(idx),%tmp;                   \
+        movzx   %h(idx),%idx;                   \
+        xor     table+3*tlen(,%idx,4),%a4;      \
+        mov     %a3,%idx;                       \
+        mov     8 sched,%a3;                    \
+        xor     table+2*tlen(,%tmp,4),%a3;
+// do_icol: do_col variant that first loads a1,a2,a4,a3 from key schedule offsets 0,4,12,8
+// NB1: original value of a3 is in idx on exit (copied there before a3 is reloaded)
+// NB2: original values of a1,a2,a4 aren't used (they are overwritten from sched)
+#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
+        mov     0 sched,%a1;                    \
+        movzx   %l(idx),%tmp;                   \
+        mov     4 sched,%a2;                    \
+        xor     table(,%tmp,4),%a1;             \
+        mov     12 sched,%a4;                   \
+        movzx   %h(idx),%tmp;                   \
+        shr     $16,%idx;                       \
+        xor     table+tlen(,%tmp,4),%a2;        \
+        movzx   %l(idx),%tmp;                   \
+        movzx   %h(idx),%idx;                   \
+        xor     table+3*tlen(,%idx,4),%a4;      \
+        mov     %a3,%idx;                       \
+        mov     8 sched,%a3;                    \
+        xor     table+2*tlen(,%tmp,4),%a3;
+// save(n,reg) stores reg to stack slot 4*n(%esp); restore(reg,n) reloads it. Original Gladman had conditional saves to MMX regs.
+#define save(a1, a2)            \
+        mov     %a2,4*a1(%esp)
+#define restore(a1, a2)         \
+        mov     4*a2(%esp),%a1
+// These macros perform a forward encryption cycle. fwd_rnd1 is entered with
+// the previous round column values in r0,r1,r4,r5 and exits with the new
+// values in r2,r1,r4,r5; fwd_rnd2 is the mirror image (r2 in, r0 out).
+// Both use two stack slots for temporary storage and r3 as scratch.
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit:  r2,r1,r4,r5
+#define fwd_rnd1(arg, table)                                            \
+        save   (0,r1);                                                  \
+        save   (1,r5);                                                  \
+                                                                        \
+        /* compute new column values */                                 \
+        do_fcol(table, r2,r5,r4,r1, r0,r3, arg);        /* idx=r0 */    \
+        do_col (table, r4,r1,r2,r5, r0,r3);             /* idx=r4 */    \
+        restore(r0,0);                                                  \
+        do_col (table, r1,r2,r5,r4, r0,r3);             /* idx=r1 */    \
+        restore(r0,1);                                                  \
+        do_col (table, r5,r4,r1,r2, r0,r3);             /* idx=r5 */
+// fwd_rnd2: forward cycle with r2 as the input/index register and r0 as an output
+// on entry: r2,r1,r4,r5
+// on exit:  r0,r1,r4,r5
+#define fwd_rnd2(arg, table)                                            \
+        save   (0,r1);                                                  \
+        save   (1,r5);                                                  \
+                                                                        \
+        /* compute new column values */                                 \
+        do_fcol(table, r0,r5,r4,r1, r2,r3, arg);        /* idx=r2 */    \
+        do_col (table, r4,r1,r0,r5, r2,r3);             /* idx=r4 */    \
+        restore(r2,0);                                                  \
+        do_col (table, r1,r0,r5,r4, r2,r3);             /* idx=r1 */    \
+        restore(r2,1);                                                  \
+        do_col (table, r5,r4,r1,r0, r2,r3);             /* idx=r5 */
+// These macros perform an inverse encryption cycle. inv_rnd1 is entered with
+// the previous round column values in r0,r1,r4,r5 and exits with the new
+// values in r2,r1,r4,r5; inv_rnd2 is the mirror image (r2 in, r0 out).
+// Both use two stack slots for temporary storage and r3 as scratch.
+// round column values
+// on entry: r0,r1,r4,r5
+// on exit:  r2,r1,r4,r5
+#define inv_rnd1(arg, table)                                            \
+        save    (0,r1);                                                 \
+        save    (1,r5);                                                 \
+                                                                        \
+        /* compute new column values */                                 \
+        do_icol(table, r2,r1,r4,r5, r0,r3, arg);        /* idx=r0 */    \
+        do_col (table, r4,r5,r2,r1, r0,r3);             /* idx=r4 */    \
+        restore(r0,0);                                                  \
+        do_col (table, r1,r4,r5,r2, r0,r3);             /* idx=r1 */    \
+        restore(r0,1);                                                  \
+        do_col (table, r5,r2,r1,r4, r0,r3);             /* idx=r5 */
+// inv_rnd2: inverse cycle with r2 as the input/index register and r0 as an output
+// on entry: r2,r1,r4,r5
+// on exit:  r0,r1,r4,r5
+#define inv_rnd2(arg, table)                                            \
+        save    (0,r1);                                                 \
+        save    (1,r5);                                                 \
+                                                                        \
+        /* compute new column values */                                 \
+        do_icol(table, r0,r1,r4,r5, r2,r3, arg);        /* idx=r2 */    \
+        do_col (table, r4,r5,r0,r1, r2,r3);             /* idx=r4 */    \
+        restore(r2,0);                                                  \
+        do_col (table, r1,r4,r5,r0, r2,r3);             /* idx=r1 */    \
+        restore(r2,1);                                                  \
+        do_col (table, r5,r0,r1,r4, r2,r3);             /* idx=r5 */
+// AES (Rijndael) Encryption Subroutine: encrypts one 16-byte block, returns 1 in %eax
+.global  aes_enc_blk
+.extern  ft_tab
+.extern  fl_tab
+.align 4
+aes_enc_blk:
+        push    %ebp
+        mov     ctx(%esp),%ebp      // pointer to context
+// CAUTION: the order and the values used in these assigns 
+// rely on the register mappings
+1:      push    %ebx
+        mov     in_blk+4(%esp),%r2  // input pointer (+4: a second register is now on the stack)
+        push    %esi
+        mov     nrnd(%ebp),%r3   // number of rounds
+        push    %edi
+#if ekey != 0
+        lea     ekey(%ebp),%ebp  // key pointer
+#endif
+// input four columns and xor in first round key
+        mov     (%r2),%r0
+        mov     4(%r2),%r1
+        mov     8(%r2),%r4
+        mov     12(%r2),%r5
+        xor     (%ebp),%r0
+        xor     4(%ebp),%r1
+        xor     8(%ebp),%r4
+        xor     12(%ebp),%r5
+        sub     $8,%esp           // space for register saves on stack
+        add     $16,%ebp          // increment to next round key
+        sub     $10,%r3           // r3 = nrnd - 10
+        je      4f              // 10 rounds for 128-bit key
+        add     $32,%ebp          // two more rounds: bias key pointer by 2 round keys
+        sub     $2,%r3
+        je      3f              // 12 rounds for 192-bit key
+        add     $32,%ebp          // two more rounds again
+2:      fwd_rnd1( -64(%ebp) ,ft_tab)    // 14 rounds for 256-bit key
+        fwd_rnd2( -48(%ebp) ,ft_tab)
+3:      fwd_rnd1( -32(%ebp) ,ft_tab)    // 12 rounds for 192-bit key
+        fwd_rnd2( -16(%ebp) ,ft_tab)
+4:      fwd_rnd1(    (%ebp) ,ft_tab)    // 10 rounds for 128-bit key
+        fwd_rnd2( +16(%ebp) ,ft_tab)
+        fwd_rnd1( +32(%ebp) ,ft_tab)
+        fwd_rnd2( +48(%ebp) ,ft_tab)
+        fwd_rnd1( +64(%ebp) ,ft_tab)
+        fwd_rnd2( +80(%ebp) ,ft_tab)
+        fwd_rnd1( +96(%ebp) ,ft_tab)
+        fwd_rnd2(+112(%ebp) ,ft_tab)
+        fwd_rnd1(+128(%ebp) ,ft_tab)
+        fwd_rnd2(+144(%ebp) ,fl_tab)    // last round uses a different table
+// move final values to the output array.  CAUTION: the 
+// order of these assigns rely on the register mappings
+        add     $8,%esp
+        mov     out_blk+12(%esp),%ebp   // output pointer (+12: four registers are on the stack)
+        mov     %r5,12(%ebp)
+        pop     %edi
+        mov     %r4,8(%ebp)
+        pop     %esi
+        mov     %r1,4(%ebp)
+        pop     %ebx
+        mov     %r0,(%ebp)
+        pop     %ebp
+        mov     $1,%eax                 // return value is always 1
+        ret
+// AES (Rijndael) Decryption Subroutine: decrypts one 16-byte block, returns 1 in %eax
+.global  aes_dec_blk
+.extern  it_tab
+.extern  il_tab
+.align 4
+aes_dec_blk:
+        push    %ebp
+        mov     ctx(%esp),%ebp       // pointer to context
+// CAUTION: the order and the values used in these assigns 
+// rely on the register mappings
+1:      push    %ebx
+        mov     in_blk+4(%esp),%r2   // input pointer (+4: a second register is now on the stack)
+        push    %esi
+        mov     nrnd(%ebp),%r3   // number of rounds
+        push    %edi
+#if dkey != 0
+        lea     dkey(%ebp),%ebp  // key pointer
+#endif
+        mov     %r3,%r0
+        shl     $4,%r0           // r0 = nrnd * 16 (one round key is 16 bytes)
+        add     %r0,%ebp         // start at dkey + 16*nrnd; the schedule is walked downwards
+        
+// input four columns and xor in first round key
+        mov     (%r2),%r0
+        mov     4(%r2),%r1
+        mov     8(%r2),%r4
+        mov     12(%r2),%r5
+        xor     (%ebp),%r0
+        xor     4(%ebp),%r1
+        xor     8(%ebp),%r4
+        xor     12(%ebp),%r5
+        sub     $8,%esp         // space for register saves on stack
+        sub     $16,%ebp        // step down to next round key
+        sub     $10,%r3         // r3 = nrnd - 10
+        je      4f              // 10 rounds for 128-bit key
+        sub     $32,%ebp        // two more rounds: bias key pointer by 2 round keys
+        sub     $2,%r3
+        je      3f              // 12 rounds for 192-bit key
+        sub     $32,%ebp        // two more rounds again
+2:      inv_rnd1( +64(%ebp), it_tab)    // 14 rounds for 256-bit key
+        inv_rnd2( +48(%ebp), it_tab)
+3:      inv_rnd1( +32(%ebp), it_tab)    // 12 rounds for 192-bit key
+        inv_rnd2( +16(%ebp), it_tab)
+4:      inv_rnd1(    (%ebp), it_tab)    // 10 rounds for 128-bit key
+        inv_rnd2( -16(%ebp), it_tab)
+        inv_rnd1( -32(%ebp), it_tab)
+        inv_rnd2( -48(%ebp), it_tab)
+        inv_rnd1( -64(%ebp), it_tab)
+        inv_rnd2( -80(%ebp), it_tab)
+        inv_rnd1( -96(%ebp), it_tab)
+        inv_rnd2(-112(%ebp), it_tab)
+        inv_rnd1(-128(%ebp), it_tab)
+        inv_rnd2(-144(%ebp), il_tab)    // last round uses a different table
+// move final values to the output array.  CAUTION: the 
+// order of these assigns rely on the register mappings
+        add     $8,%esp
+        mov     out_blk+12(%esp),%ebp   // output pointer (+12: four registers are on the stack)
+        mov     %r5,12(%ebp)
+        pop     %edi
+        mov     %r4,8(%ebp)
+        pop     %esi
+        mov     %r1,4(%ebp)
+        pop     %ebx
+        mov     %r0,(%ebp)
+        pop     %ebp
+        mov     $1,%eax                 // return value is always 1
+        ret
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /arch/i386/crypto/aes-i586-asm.S

diff --git a/arch/i386/crypto/aes-i586-asm.S b/arch/i386/crypto/aes-i586-asm.S new file mode 100644 index 000000000000..7b73c67cb4e8 --- /dev/null +++ b/arch/i386/crypto/aes-i586-asm.S
@@ -0,0 +1,376 @@
	1	// -------------------------------------------------------------------------
	2	// Copyright (c) 2001, Dr Brian Gladman < >, Worcester, UK.
	3	// All rights reserved.
	4	//
	5	// LICENSE TERMS
	6	//
	7	// The free distribution and use of this software in both source and binary
	8	// form is allowed (with or without changes) provided that:
	9	//
	10	// 1. distributions of this source code include the above copyright
	11	// notice, this list of conditions and the following disclaimer//
	12	//
	13	// 2. distributions in binary form include the above copyright
	14	// notice, this list of conditions and the following disclaimer
	15	// in the documentation and/or other associated materials//
	16	//
	17	// 3. the copyright holder's name is not used to endorse products
	18	// built using this software without specific written permission.
	19	//
	20	//
	21	// ALTERNATIVELY, provided that this notice is retained in full, this product
	22	// may be distributed under the terms of the GNU General Public License (GPL),
	23	// in which case the provisions of the GPL apply INSTEAD OF those given above.
	24	//
	25	// Copyright (c) 2004 Linus Torvalds <torvalds@osdl.org>
	26	// Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
	27
	28	// DISCLAIMER
	29	//
	30	// This software is provided 'as is' with no explicit or implied warranties
	31	// in respect of its properties including, but not limited to, correctness
	32	// and fitness for purpose.
	33	// -------------------------------------------------------------------------
	34	// Issue Date: 29/07/2002
	35
	36	.file "aes-i586-asm.S"
	37	.text
	38
	39	// aes_rval aes_enc_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])//
	40	// aes_rval aes_dec_blk(const unsigned char in_blk[], unsigned char out_blk[], const aes_ctx cx[1])//
	41
	42	#define tlen 1024 // byte length of each of the 4 'xor' arrays (256 32-bit words)
	43
	44	// %esp-relative offsets to parameters with one register (%ebp) pushed onto stack
	45
	46	#define in_blk 8 // input byte array address parameter
	47	#define out_blk 12 // output byte array address parameter
	48	#define ctx 16 // AES context structure
	49
	50	// byte offsets in context structure
	51
	52	#define ekey 0 // encryption key schedule base address
	53	#define nrnd 256 // number of rounds (code branches on 10 and 12; any other value takes the 14-round path)
	54	#define dkey 260 // decryption key schedule base address
	55
	56	// register mapping for encrypt and decrypt subroutines; only r0-r3 (eax..edx) have the l()/h() byte names below
	57
	58	#define r0 eax
	59	#define r1 ebx
	60	#define r2 ecx
	61	#define r3 edx
	62	#define r4 esi
	63	#define r5 edi
	64
	65	#define eaxl al
	66	#define eaxh ah
	67	#define ebxl bl
	68	#define ebxh bh
	69	#define ecxl cl
	70	#define ecxh ch
	71	#define edxl dl
	72	#define edxh dh
	73
	74	#define _h(reg) reg##h
	75	#define h(reg) _h(reg)
	76
	77	#define _l(reg) reg##l
	78	#define l(reg) _l(reg)
	79
	80	// This macro takes a 32-bit word representing a column and uses
	81	// each of its four bytes to index into four tables of 256 32-bit
	82	// words to obtain values that are then xored into the four output
	83	// registers a1..a4 (lowest byte -> a1, then a2, a3, highest byte -> a4).
	84
	85	// Parameters:
	86	// table base address of the first table; the others follow at +tlen, +2*tlen, +3*tlen
	87	// a1 out_state[0]
	88	// a2 out_state[1]
	89	// a3 out_state[2]
	90	// a4 out_state[3]
	91	// idx input register for the round (destroyed); needs l()/h() byte names, i.e. one of r0-r3
	92	// tmp scratch register for the round
	93	// (do_col takes no key schedule argument; sched is a parameter of do_fcol/do_icol only)
	94
	95	#define do_col(table, a1,a2,a3,a4, idx, tmp) \
	96	movzx %l(idx),%tmp; \
	97	xor table(,%tmp,4),%a1; \
	98	movzx %h(idx),%tmp; \
	99	shr $16,%idx; \
	100	xor table+tlen(,%tmp,4),%a2; \
	101	movzx %l(idx),%tmp; \
	102	movzx %h(idx),%idx; \
	103	xor table+2*tlen(,%tmp,4),%a3; \
	104	xor table+3*tlen(,%idx,4),%a4;
	105
	106	// do_fcol: do_col variant that first loads a1,a2,a4,a3 from key schedule offsets 0,12,4,8
	107	// NB1: original value of a3 is in idx on exit (copied there before a3 is reloaded)
	108	// NB2: original values of a1,a2,a4 aren't used (they are overwritten from sched)
	109	#define do_fcol(table, a1,a2,a3,a4, idx, tmp, sched) \
	110	mov 0 sched,%a1; \
	111	movzx %l(idx),%tmp; \
	112	mov 12 sched,%a2; \
	113	xor table(,%tmp,4),%a1; \
	114	mov 4 sched,%a4; \
	115	movzx %h(idx),%tmp; \
	116	shr $16,%idx; \
	117	xor table+tlen(,%tmp,4),%a2; \
	118	movzx %l(idx),%tmp; \
	119	movzx %h(idx),%idx; \
	120	xor table+3*tlen(,%idx,4),%a4; \
	121	mov %a3,%idx; \
	122	mov 8 sched,%a3; \
	123	xor table+2*tlen(,%tmp,4),%a3;
	124
	125	// do_icol: do_col variant that first loads a1,a2,a4,a3 from key schedule offsets 0,4,12,8
	126	// NB1: original value of a3 is in idx on exit (copied there before a3 is reloaded)
	127	// NB2: original values of a1,a2,a4 aren't used (they are overwritten from sched)
	128	#define do_icol(table, a1,a2,a3,a4, idx, tmp, sched) \
	129	mov 0 sched,%a1; \
	130	movzx %l(idx),%tmp; \
	131	mov 4 sched,%a2; \
	132	xor table(,%tmp,4),%a1; \
	133	mov 12 sched,%a4; \
	134	movzx %h(idx),%tmp; \
	135	shr $16,%idx; \
	136	xor table+tlen(,%tmp,4),%a2; \
	137	movzx %l(idx),%tmp; \
	138	movzx %h(idx),%idx; \
	139	xor table+3*tlen(,%idx,4),%a4; \
	140	mov %a3,%idx; \
	141	mov 8 sched,%a3; \
	142	xor table+2*tlen(,%tmp,4),%a3;
	143
	144
	145	// save(n,reg) stores reg to stack slot 4*n(%esp); restore(reg,n) reloads it. Original Gladman had conditional saves to MMX regs.
	146	#define save(a1, a2) \
	147	mov %a2,4*a1(%esp)
	148
	149	#define restore(a1, a2) \
	150	mov 4*a2(%esp),%a1
	151
	152	// These macros perform a forward encryption cycle. fwd_rnd1 is entered with
	153	// the previous round column values in r0,r1,r4,r5 and exits with the new
	154	// values in r2,r1,r4,r5; fwd_rnd2 is the mirror image (r2 in, r0 out).
	155	// Both use two stack slots for temporary storage and r3 as scratch.
	156
	157	// round column values
	158	// on entry: r0,r1,r4,r5
	159	// on exit: r2,r1,r4,r5
	160	#define fwd_rnd1(arg, table) \
	161	save (0,r1); \
	162	save (1,r5); \
	163	\
	164	/* compute new column values */ \
	165	do_fcol(table, r2,r5,r4,r1, r0,r3, arg); /* idx=r0 */ \
	166	do_col (table, r4,r1,r2,r5, r0,r3); /* idx=r4 */ \
	167	restore(r0,0); \
	168	do_col (table, r1,r2,r5,r4, r0,r3); /* idx=r1 */ \
	169	restore(r0,1); \
	170	do_col (table, r5,r4,r1,r2, r0,r3); /* idx=r5 */
	171
	172	// fwd_rnd2: forward cycle with r2 as the input/index register and r0 as an output
	173	// on entry: r2,r1,r4,r5
	174	// on exit: r0,r1,r4,r5
	175	#define fwd_rnd2(arg, table) \
	176	save (0,r1); \
	177	save (1,r5); \
	178	\
	179	/* compute new column values */ \
	180	do_fcol(table, r0,r5,r4,r1, r2,r3, arg); /* idx=r2 */ \
	181	do_col (table, r4,r1,r0,r5, r2,r3); /* idx=r4 */ \
	182	restore(r2,0); \
	183	do_col (table, r1,r0,r5,r4, r2,r3); /* idx=r1 */ \
	184	restore(r2,1); \
	185	do_col (table, r5,r4,r1,r0, r2,r3); /* idx=r5 */
	186
	187	// These macros perform an inverse encryption cycle. inv_rnd1 is entered with
	188	// the previous round column values in r0,r1,r4,r5 and exits with the new
	189	// values in r2,r1,r4,r5; inv_rnd2 is the mirror image (r2 in, r0 out).
	190	// Both use two stack slots for temporary storage and r3 as scratch.
	191
	192	// round column values
	193	// on entry: r0,r1,r4,r5
	194	// on exit: r2,r1,r4,r5
	195	#define inv_rnd1(arg, table) \
	196	save (0,r1); \
	197	save (1,r5); \
	198	\
	199	/* compute new column values */ \
	200	do_icol(table, r2,r1,r4,r5, r0,r3, arg); /* idx=r0 */ \
	201	do_col (table, r4,r5,r2,r1, r0,r3); /* idx=r4 */ \
	202	restore(r0,0); \
	203	do_col (table, r1,r4,r5,r2, r0,r3); /* idx=r1 */ \
	204	restore(r0,1); \
	205	do_col (table, r5,r2,r1,r4, r0,r3); /* idx=r5 */
	206
	207	// inv_rnd2: inverse cycle with r2 as the input/index register and r0 as an output
	208	// on entry: r2,r1,r4,r5
	209	// on exit: r0,r1,r4,r5
	210	#define inv_rnd2(arg, table) \
	211	save (0,r1); \
	212	save (1,r5); \
	213	\
	214	/* compute new column values */ \
	215	do_icol(table, r0,r1,r4,r5, r2,r3, arg); /* idx=r2 */ \
	216	do_col (table, r4,r5,r0,r1, r2,r3); /* idx=r4 */ \
	217	restore(r2,0); \
	218	do_col (table, r1,r4,r5,r0, r2,r3); /* idx=r1 */ \
	219	restore(r2,1); \
	220	do_col (table, r5,r0,r1,r4, r2,r3); /* idx=r5 */
	221
	222	// AES (Rijndael) Encryption Subroutine: encrypts one 16-byte block, returns 1 in %eax
	223
	224	.global aes_enc_blk
	225
	226	.extern ft_tab
	227	.extern fl_tab
	228
	229	.align 4
	230
	231	aes_enc_blk:
	232	push %ebp
	233	mov ctx(%esp),%ebp // pointer to context
	234
	235	// CAUTION: the order and the values used in these assigns
	236	// rely on the register mappings
	237
	238	1: push %ebx
	239	mov in_blk+4(%esp),%r2 // input pointer (+4: a second register is now on the stack)
	240	push %esi
	241	mov nrnd(%ebp),%r3 // number of rounds
	242	push %edi
	243	#if ekey != 0
	244	lea ekey(%ebp),%ebp // key pointer
	245	#endif
	246
	247	// input four columns and xor in first round key
	248
	249	mov (%r2),%r0
	250	mov 4(%r2),%r1
	251	mov 8(%r2),%r4
	252	mov 12(%r2),%r5
	253	xor (%ebp),%r0
	254	xor 4(%ebp),%r1
	255	xor 8(%ebp),%r4
	256	xor 12(%ebp),%r5
	257
	258	sub $8,%esp // space for register saves on stack
	259	add $16,%ebp // increment to next round key
	260	sub $10,%r3 // r3 = nrnd - 10
	261	je 4f // 10 rounds for 128-bit key
	262	add $32,%ebp // two more rounds: bias key pointer by 2 round keys
	263	sub $2,%r3
	264	je 3f // 12 rounds for 192-bit key
	265	add $32,%ebp // two more rounds again
	266
	267	2: fwd_rnd1( -64(%ebp) ,ft_tab) // 14 rounds for 256-bit key
	268	fwd_rnd2( -48(%ebp) ,ft_tab)
	269	3: fwd_rnd1( -32(%ebp) ,ft_tab) // 12 rounds for 192-bit key
	270	fwd_rnd2( -16(%ebp) ,ft_tab)
	271	4: fwd_rnd1( (%ebp) ,ft_tab) // 10 rounds for 128-bit key
	272	fwd_rnd2( +16(%ebp) ,ft_tab)
	273	fwd_rnd1( +32(%ebp) ,ft_tab)
	274	fwd_rnd2( +48(%ebp) ,ft_tab)
	275	fwd_rnd1( +64(%ebp) ,ft_tab)
	276	fwd_rnd2( +80(%ebp) ,ft_tab)
	277	fwd_rnd1( +96(%ebp) ,ft_tab)
	278	fwd_rnd2(+112(%ebp) ,ft_tab)
	279	fwd_rnd1(+128(%ebp) ,ft_tab)
	280	fwd_rnd2(+144(%ebp) ,fl_tab) // last round uses a different table
	281
	282	// move final values to the output array. CAUTION: the
	283	// order of these assigns rely on the register mappings
	284
	285	add $8,%esp
	286	mov out_blk+12(%esp),%ebp // output pointer (+12: four registers are on the stack)
	287	mov %r5,12(%ebp)
	288	pop %edi
	289	mov %r4,8(%ebp)
	290	pop %esi
	291	mov %r1,4(%ebp)
	292	pop %ebx
	293	mov %r0,(%ebp)
	294	pop %ebp
	295	mov $1,%eax // return value is always 1
	296	ret
	297
	298	// AES (Rijndael) Decryption Subroutine: decrypts one 16-byte block, returns 1 in %eax
	299
	300	.global aes_dec_blk
	301
	302	.extern it_tab
	303	.extern il_tab
	304
	305	.align 4
	306
	307	aes_dec_blk:
	308	push %ebp
	309	mov ctx(%esp),%ebp // pointer to context
	310
	311	// CAUTION: the order and the values used in these assigns
	312	// rely on the register mappings
	313
	314	1: push %ebx
	315	mov in_blk+4(%esp),%r2 // input pointer (+4: a second register is now on the stack)
	316	push %esi
	317	mov nrnd(%ebp),%r3 // number of rounds
	318	push %edi
	319	#if dkey != 0
	320	lea dkey(%ebp),%ebp // key pointer
	321	#endif
	322	mov %r3,%r0
	323	shl $4,%r0 // r0 = nrnd * 16 (one round key is 16 bytes)
	324	add %r0,%ebp // start at dkey + 16*nrnd; the schedule is walked downwards
	325
	326	// input four columns and xor in first round key
	327
	328	mov (%r2),%r0
	329	mov 4(%r2),%r1
	330	mov 8(%r2),%r4
	331	mov 12(%r2),%r5
	332	xor (%ebp),%r0
	333	xor 4(%ebp),%r1
	334	xor 8(%ebp),%r4
	335	xor 12(%ebp),%r5
	336
	337	sub $8,%esp // space for register saves on stack
	338	sub $16,%ebp // step down to next round key
	339	sub $10,%r3 // r3 = nrnd - 10
	340	je 4f // 10 rounds for 128-bit key
	341	sub $32,%ebp // two more rounds: bias key pointer by 2 round keys
	342	sub $2,%r3
	343	je 3f // 12 rounds for 192-bit key
	344	sub $32,%ebp // two more rounds again
	345
	346	2: inv_rnd1( +64(%ebp), it_tab) // 14 rounds for 256-bit key
	347	inv_rnd2( +48(%ebp), it_tab)
	348	3: inv_rnd1( +32(%ebp), it_tab) // 12 rounds for 192-bit key
	349	inv_rnd2( +16(%ebp), it_tab)
	350	4: inv_rnd1( (%ebp), it_tab) // 10 rounds for 128-bit key
	351	inv_rnd2( -16(%ebp), it_tab)
	352	inv_rnd1( -32(%ebp), it_tab)
	353	inv_rnd2( -48(%ebp), it_tab)
	354	inv_rnd1( -64(%ebp), it_tab)
	355	inv_rnd2( -80(%ebp), it_tab)
	356	inv_rnd1( -96(%ebp), it_tab)
	357	inv_rnd2(-112(%ebp), it_tab)
	358	inv_rnd1(-128(%ebp), it_tab)
	359	inv_rnd2(-144(%ebp), il_tab) // last round uses a different table
	360
	361	// move final values to the output array. CAUTION: the
	362	// order of these assigns rely on the register mappings
	363
	364	add $8,%esp
	365	mov out_blk+12(%esp),%ebp // output pointer (+12: four registers are on the stack)
	366	mov %r5,12(%ebp)
	367	pop %edi
	368	mov %r4,8(%ebp)
	369	pop %esi
	370	mov %r1,4(%ebp)
	371	pop %ebx
	372	mov %r0,(%ebp)
	373	pop %ebp
	374	mov $1,%eax // return value is always 1
	375	ret
	376