[CRYPTO] Add x86_64 asm AES

Implementation: =============== The encrypt/decrypt code is based on an x86 implementation I did a while ago which I never published. This unpublished implementation does include an assembler based key schedule and precomputed tables. For simplicity and best acceptance, however, I took Gladman's in-kernel code for table generation and key schedule for the kernel port of my assembler code and modified this code to produce the key schedule as required by my assembler implementation. File locations and Kconfig are kept similar to the i586 AES assembler implementation. It may seem a little bit strange to use 32 bit I/O and registers in the assembler implementation but this gives the best code size. My implementation takes one instruction more per round compared to Gladman's x86 assembler but it doesn't require any stack for local variables or saved registers and it is less serialized than Gladman's code. Note that all comparisons to Gladman's code were done after my code was implemented. I did only use FIPS PUB 197 for the implementation so my implementation is independent work. If anybody has a better assembler solution for x86_64 I'll be pleased to have my code replaced with the better solution. Testing: ======== The implementation passes the in-kernel crypto testing module and I'm running it without any problems on my laptop where it is mainly used for dm-crypt. Microbenchmark: =============== The microbenchmark was done in userspace with similar compile flags as used during kernel compile. Encrypt/decrypt is about 35% faster than the generic C implementation. As the generic C as well as my assembler implementation are both table based, I don't really expect that there is much room for further improvements though I'll be glad to be corrected here. The key schedule is about 5% slower than the generic C implementation. This is due to the fact that some more work has to be done in the key schedule routine to fit the schedule to the assembler implementation.
Code Size: ========== Encrypt and decrypt are together about 2.1 Kbytes smaller than the generic C implementation which is important with regard to L1 cache usage. The key schedule routine is about 100 bytes larger than the generic C implementation. Data Size: ========== There's no difference in data size requirements between the assembler implementation and the generic C implementation. License: ======== Gladman's code is dual BSD/GPL whereas my assembler code is GPLv2 only (I'm not going to change the license for my code). So I had to change the module license for the x86_64 aes module from 'Dual BSD/GPL' to 'GPL' to reflect the most restrictive license within the module. Signed-off-by: Andreas Steinmetz <ast@domdv.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Andreas Steinmetz <ast@domdv.de> 2005-07-06 16:55:00 -0400
committer: David S. Miller <davem@davemloft.net> 2005-07-06 16:55:00 -0400
commit: a2a892a236d03a6e985471a7e57d1c863de144c8 (patch)
tree: 33b52c87bdecf0f24936b952a565a445ce03c616 /arch/x86_64/crypto/aes-x86_64-asm.S
parent: a61cc44812ff94793987bf43b70a3d9bc64a6820 (diff)
1 files changed, 186 insertions, 0 deletions
diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S
new file mode 100644
index 000000000000..483cbb23ab8d
--- /dev/null
+++ b/arch/x86_64/crypto/aes-x86_64-asm.S
@@ -0,0 +1,186 @@
+/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
+ *
+ * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
+ *
+ * License:
+ * This code can be distributed under the terms of the GNU General Public
+ * License (GPL) Version 2 provided that the above header down to and
+ * including this sentence is retained in full.
+ */
+.extern aes_ft_tab
+.extern aes_it_tab
+.extern aes_fl_tab
+.extern aes_il_tab
+.text
+#define R1      %rax    /* R1..R4 have 64-bit, 32-bit (E), 16-bit (X) and byte (H/L) names */
+#define R1E     %eax
+#define R1X     %ax
+#define R1H     %ah     /* bits 8..15; round() zero-extends it into a table index */
+#define R1L     %al     /* bits 0..7 */
+#define R2      %rbx    /* entry() copies the incoming value to R8; return copies it back */
+#define R2E     %ebx
+#define R2X     %bx
+#define R2H     %bh
+#define R2L     %bl
+#define R3      %rcx
+#define R3E     %ecx
+#define R3X     %cx
+#define R3H     %ch
+#define R3L     %cl
+#define R4      %rdx    /* input block pointer at entry(); overwritten by the last block word */
+#define R4E     %edx
+#define R4X     %dx
+#define R4H     %dh
+#define R4L     %dl
+#define R5      %rsi    /* R5..R7 only get 64-bit and 32-bit (E) names */
+#define R5E     %esi
+#define R6      %rdi    /* base register for the key length and key schedule reads in entry() */
+#define R6E     %edi
+#define R7      %rbp    /* entry() copies the incoming value to R9; return copies it back */
+#define R7E     %ebp
+#define R8      %r8     /* holds the saved R2 between entry() and return */
+#define R9      %r9     /* holds the saved R7 between entry() and return */
+#define R10     %r10    /* key schedule pointer computed by entry(), used by every round */
+#define R11     %r11    /* copy of R5 taken by entry(); return stores the result through it */
+#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) /* function entry: define FUNC, load the block, xor the first 16 key bytes, branch on the length field */ \
+        .global FUNC;                   /* export the symbol */ \
+        .type   FUNC,@function;         \
+        .align  8;                      \
+FUNC:   movq    r1,r2;                  /* park r1 in r2; no stack is touched */ \
+        movq    r3,r4;                  /* park r3 in r4 */ \
+        leaq    BASE+52(r8),r9;         /* r9 = r8 + BASE + 52, a biased pointer into the key schedule */ \
+        movq    r10,r11;                /* copy r10 before its low half is reused below */ \
+        movl    (r7),r5 ## E;           /* load four 32-bit words from (r7) */ \
+        movl    4(r7),r1 ## E;          \
+        movl    8(r7),r6 ## E;          \
+        movl    12(r7),r7 ## E;         /* the last load overwrites the pointer register itself */ \
+        movl    (r8),r10 ## E;          /* first dword at r8; presumably the key length in bytes - confirm against the C key setup */ \
+        xorl    -48(r9),r5 ## E;        /* xor in the 16 bytes starting at r8 + BASE + 4 */ \
+        xorl    -44(r9),r1 ## E;        \
+        xorl    -40(r9),r6 ## E;        \
+        xorl    -36(r9),r7 ## E;        \
+        cmpl    $24,r10 ## E;           /* three-way split on the length field */ \
+        jb      B128;                   /* below 24: r9 is left unchanged */ \
+        leaq    32(r9),r9;              /* lea does not modify flags, so je still sees the cmpl result */ \
+        je      B192;                   /* equal to 24: r9 advanced by 32 */ \
+        leaq    32(r9),r9;              /* above 24: r9 advanced by 64 in total, fall through */
+#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) /* function exit: restore two parked registers, store four result words, return */ \
+        movq    r1,r2;                  /* copy r1 back into r2 */ \
+        movq    r3,r4;                  /* copy r3 back into r4 */ \
+        movl    r5 ## E,(r9);           /* write the four 32-bit result words to (r9) .. 12(r9) */ \
+        movl    r6 ## E,4(r9);          \
+        movl    r7 ## E,8(r9);          \
+        movl    r8 ## E,12(r9);         \
+        ret;
+#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) /* one table-lookup round: r1..r4 hold the four input words and are consumed; results land in r5,r6,r3,r4; r7 is scratch; r8 is the key pointer; ra..rd name the registers that take the key words at OFFSET, +4, +8, +12 */ \
+        movzbl  r2 ## H,r5 ## E;        /* r5 = byte 1 of r2 */ \
+        movzbl  r2 ## L,r6 ## E;        /* r6 = byte 0 of r2 */ \
+        movl    TAB+1024(,r5,4),r5 ## E; /* sub-table 1 (TAB+1024), 4-byte entries */ \
+        movw    r4 ## X,r2 ## X;        /* r2 low half := r4 low half; r2 high half is kept */ \
+        movl    TAB(,r6,4),r6 ## E;     /* sub-table 0 */ \
+        roll    $16,r2 ## E;            /* swap halves: old r2 bytes 2,3 are now low, r4 bytes 0,1 high */ \
+        shrl    $16,r4 ## E;            /* r4 = its own upper half */ \
+        movzbl  r4 ## H,r7 ## E;        /* r7 = byte 3 of the original r4 */ \
+        movzbl  r4 ## L,r4 ## E;        /* r4 = byte 2 of the original r4 */ \
+        xorl    OFFSET(r8),ra ## E;     /* key word 0 */ \
+        xorl    OFFSET+4(r8),rb ## E;   /* key word 1 */ \
+        xorl    TAB+3072(,r7,4),r5 ## E; /* sub-table 3 */ \
+        xorl    TAB+2048(,r4,4),r6 ## E; /* sub-table 2 */ \
+        movzbl  r1 ## L,r7 ## E;        /* r7 = byte 0 of r1 */ \
+        movzbl  r1 ## H,r4 ## E;        /* r4 = byte 1 of r1 */ \
+        movl    TAB+1024(,r4,4),r4 ## E; /* r4 starts accumulating the fourth result */ \
+        movw    r3 ## X,r1 ## X;        /* same half-merge trick for the r1/r3 pair */ \
+        roll    $16,r1 ## E;            /* old r1 bytes 2,3 now low, r3 bytes 0,1 high */ \
+        shrl    $16,r3 ## E;            /* r3 = its own upper half */ \
+        xorl    TAB(,r7,4),r5 ## E;     \
+        movzbl  r3 ## H,r7 ## E;        /* r7 = byte 3 of the original r3 */ \
+        movzbl  r3 ## L,r3 ## E;        /* r3 = byte 2 of the original r3 */ \
+        xorl    TAB+3072(,r7,4),r4 ## E; \
+        xorl    TAB+2048(,r3,4),r5 ## E; /* r5 now has all four table terms */ \
+        movzbl  r1 ## H,r7 ## E;        /* r7 = byte 3 of the original r1 */ \
+        movzbl  r1 ## L,r3 ## E;        /* r3 = byte 2 of the original r1 */ \
+        shrl    $16,r1 ## E;            /* r1 = low half of the original r3 */ \
+        xorl    TAB+3072(,r7,4),r6 ## E; \
+        movl    TAB+2048(,r3,4),r3 ## E; /* r3 starts accumulating the third result */ \
+        movzbl  r1 ## H,r7 ## E;        /* r7 = byte 1 of the original r3 */ \
+        movzbl  r1 ## L,r1 ## E;        /* r1 = byte 0 of the original r3 */ \
+        xorl    TAB+1024(,r7,4),r6 ## E; /* r6 now has all four table terms */ \
+        xorl    TAB(,r1,4),r3 ## E;     \
+        movzbl  r2 ## H,r1 ## E;        /* r1 = byte 3 of the original r2 */ \
+        movzbl  r2 ## L,r7 ## E;        /* r7 = byte 2 of the original r2 */ \
+        shrl    $16,r2 ## E;            /* r2 = low half of the original r4 */ \
+        xorl    TAB+3072(,r1,4),r3 ## E; \
+        xorl    TAB+2048(,r7,4),r4 ## E; \
+        movzbl  r2 ## H,r1 ## E;        /* r1 = byte 1 of the original r4 */ \
+        movzbl  r2 ## L,r2 ## E;        /* r2 = byte 0 of the original r4 */ \
+        xorl    OFFSET+8(r8),rc ## E;   /* key word 2 */ \
+        xorl    OFFSET+12(r8),rd ## E;  /* key word 3 */ \
+        xorl    TAB+1024(,r1,4),r3 ## E; /* r3 now has all four table terms */ \
+        xorl    TAB(,r2,4),r4 ## E;     /* r4 now has all four table terms */
+#define move_regs(r1,r2,r3,r4) /* two 32-bit register copies: r3 into r1, r4 into r2 */ \
+        movl    r3 ## E,r1 ## E;        \
+        movl    r4 ## E,r2 ## E;
+#define entry(FUNC,BASE,B128,B192) /* prologue with fixed registers: R2->R8 and R7->R9 parked, block read via R4 into R1..R4, key data read via R6, R10 = key pointer, R5 copied to R11 */ \
+        prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) /* restore R2 and R7 from R8 and R9, store R5,R6,R3,R4 through R11, ret */
+#define encrypt_round(TAB,OFFSET) /* round on R1..R4 with results in R5,R6,R3,R4, then R5,R6 copied to R1,R2 so the state is back in R1..R4 */ \
+        round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
+        move_regs(R1,R2,R5,R6)
+#define encrypt_final(TAB,OFFSET) /* same round without the copy-back; the result stays in R5,R6,R3,R4 where return expects it */ \
+        round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
+#define decrypt_round(TAB,OFFSET) /* like encrypt_round but the round gets R1/R2, R3/R4 and R5/R6 pairwise swapped; key registers and copy-back are unchanged */ \
+        round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
+        move_regs(R1,R2,R5,R6)
+#define decrypt_final(TAB,OFFSET) /* swapped round without the copy-back; the result stays in R5,R6,R3,R4 */ \
+        round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
+/* void aes_encrypt(void *ctx, u8 *out, const u8 *in): reads 16 bytes at in, writes 16 bytes at out; uses no stack */
+        entry(aes_encrypt,0,enc128,enc192)      /* BASE 0: key words are read starting at ctx+4 */
+        encrypt_round(aes_ft_tab,-96)           /* these two rounds run only when the length field is above 24 */
+        encrypt_round(aes_ft_tab,-80)
+enc192: encrypt_round(aes_ft_tab,-64)           /* entered directly when the length field equals 24 */
+        encrypt_round(aes_ft_tab,-48)
+enc128: encrypt_round(aes_ft_tab,-32)           /* entered directly when the length field is below 24 */
+        encrypt_round(aes_ft_tab,-16)           /* key pointer bias from entry() lets all key sizes share these offsets */
+        encrypt_round(aes_ft_tab,  0)
+        encrypt_round(aes_ft_tab, 16)
+        encrypt_round(aes_ft_tab, 32)
+        encrypt_round(aes_ft_tab, 48)
+        encrypt_round(aes_ft_tab, 64)
+        encrypt_round(aes_ft_tab, 80)
+        encrypt_round(aes_ft_tab, 96)
+        encrypt_final(aes_fl_tab,112)           /* last round looks up aes_fl_tab instead of aes_ft_tab */
+        return                                  /* restore %rbx and %rbp, store the block to out, ret */
+/* void aes_decrypt(void *ctx, u8 *out, const u8 *in): reads 16 bytes at in, writes 16 bytes at out; uses no stack */
+        entry(aes_decrypt,240,dec128,dec192)    /* BASE 240: key words are read starting at ctx+244 */
+        decrypt_round(aes_it_tab,-96)           /* these two rounds run only when the length field is above 24 */
+        decrypt_round(aes_it_tab,-80)
+dec192: decrypt_round(aes_it_tab,-64)           /* entered directly when the length field equals 24 */
+        decrypt_round(aes_it_tab,-48)
+dec128: decrypt_round(aes_it_tab,-32)           /* entered directly when the length field is below 24 */
+        decrypt_round(aes_it_tab,-16)           /* key pointer bias from entry() lets all key sizes share these offsets */
+        decrypt_round(aes_it_tab,  0)
+        decrypt_round(aes_it_tab, 16)
+        decrypt_round(aes_it_tab, 32)
+        decrypt_round(aes_it_tab, 48)
+        decrypt_round(aes_it_tab, 64)
+        decrypt_round(aes_it_tab, 80)
+        decrypt_round(aes_it_tab, 96)
+        decrypt_final(aes_il_tab,112)           /* last round looks up aes_il_tab instead of aes_it_tab */
+        return                                  /* restore %rbx and %rbp, store the block to out, ret */
author	Andreas Steinmetz <ast@domdv.de>	2005-07-06 16:55:00 -0400
committer	David S. Miller <davem@davemloft.net>	2005-07-06 16:55:00 -0400
commit	a2a892a236d03a6e985471a7e57d1c863de144c8 (patch)
tree	33b52c87bdecf0f24936b952a565a445ce03c616 /arch/x86_64/crypto/aes-x86_64-asm.S
parent	a61cc44812ff94793987bf43b70a3d9bc64a6820 (diff)

diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S new file mode 100644 index 000000000000..483cbb23ab8d --- /dev/null +++ b/arch/x86_64/crypto/aes-x86_64-asm.S
@@ -0,0 +1,186 @@
	1	/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
	2	*
	3	* Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
	4	*
	5	* License:
	6	* This code can be distributed under the terms of the GNU General Public
	7	* License (GPL) Version 2 provided that the above header down to and
	8	* including this sentence is retained in full.
	9	*/
	10
	11	.extern aes_ft_tab
	12	.extern aes_it_tab
	13	.extern aes_fl_tab
	14	.extern aes_il_tab
	15
	16	.text
	17
	18	#define R1 %rax
	19	#define R1E %eax
	20	#define R1X %ax
	21	#define R1H %ah
	22	#define R1L %al
	23	#define R2 %rbx
	24	#define R2E %ebx
	25	#define R2X %bx
	26	#define R2H %bh
	27	#define R2L %bl
	28	#define R3 %rcx
	29	#define R3E %ecx
	30	#define R3X %cx
	31	#define R3H %ch
	32	#define R3L %cl
	33	#define R4 %rdx
	34	#define R4E %edx
	35	#define R4X %dx
	36	#define R4H %dh
	37	#define R4L %dl
	38	#define R5 %rsi
	39	#define R5E %esi
	40	#define R6 %rdi
	41	#define R6E %edi
	42	#define R7 %rbp
	43	#define R7E %ebp
	44	#define R8 %r8
	45	#define R9 %r9
	46	#define R10 %r10
	47	#define R11 %r11
	48
	49	#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
	50	.global FUNC; \
	51	.type FUNC,@function; \
	52	.align 8; \
	53	FUNC: movq r1,r2; \
	54	movq r3,r4; \
	55	leaq BASE+52(r8),r9; \
	56	movq r10,r11; \
	57	movl (r7),r5 ## E; \
	58	movl 4(r7),r1 ## E; \
	59	movl 8(r7),r6 ## E; \
	60	movl 12(r7),r7 ## E; \
	61	movl (r8),r10 ## E; \
	62	xorl -48(r9),r5 ## E; \
	63	xorl -44(r9),r1 ## E; \
	64	xorl -40(r9),r6 ## E; \
	65	xorl -36(r9),r7 ## E; \
	66	cmpl $24,r10 ## E; \
	67	jb B128; \
	68	leaq 32(r9),r9; \
	69	je B192; \
	70	leaq 32(r9),r9;
	71
	72	#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
	73	movq r1,r2; \
	74	movq r3,r4; \
	75	movl r5 ## E,(r9); \
	76	movl r6 ## E,4(r9); \
	77	movl r7 ## E,8(r9); \
	78	movl r8 ## E,12(r9); \
	79	ret;
	80
	81	#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
	82	movzbl r2 ## H,r5 ## E; \
	83	movzbl r2 ## L,r6 ## E; \
	84	movl TAB+1024(,r5,4),r5 ## E;\
	85	movw r4 ## X,r2 ## X; \
	86	movl TAB(,r6,4),r6 ## E; \
	87	roll $16,r2 ## E; \
	88	shrl $16,r4 ## E; \
	89	movzbl r4 ## H,r7 ## E; \
	90	movzbl r4 ## L,r4 ## E; \
	91	xorl OFFSET(r8),ra ## E; \
	92	xorl OFFSET+4(r8),rb ## E; \
	93	xorl TAB+3072(,r7,4),r5 ## E;\
	94	xorl TAB+2048(,r4,4),r6 ## E;\
	95	movzbl r1 ## L,r7 ## E; \
	96	movzbl r1 ## H,r4 ## E; \
	97	movl TAB+1024(,r4,4),r4 ## E;\
	98	movw r3 ## X,r1 ## X; \
	99	roll $16,r1 ## E; \
	100	shrl $16,r3 ## E; \
	101	xorl TAB(,r7,4),r5 ## E; \
	102	movzbl r3 ## H,r7 ## E; \
	103	movzbl r3 ## L,r3 ## E; \
	104	xorl TAB+3072(,r7,4),r4 ## E;\
	105	xorl TAB+2048(,r3,4),r5 ## E;\
	106	movzbl r1 ## H,r7 ## E; \
	107	movzbl r1 ## L,r3 ## E; \
	108	shrl $16,r1 ## E; \
	109	xorl TAB+3072(,r7,4),r6 ## E;\
	110	movl TAB+2048(,r3,4),r3 ## E;\
	111	movzbl r1 ## H,r7 ## E; \
	112	movzbl r1 ## L,r1 ## E; \
	113	xorl TAB+1024(,r7,4),r6 ## E;\
	114	xorl TAB(,r1,4),r3 ## E; \
	115	movzbl r2 ## H,r1 ## E; \
	116	movzbl r2 ## L,r7 ## E; \
	117	shrl $16,r2 ## E; \
	118	xorl TAB+3072(,r1,4),r3 ## E;\
	119	xorl TAB+2048(,r7,4),r4 ## E;\
	120	movzbl r2 ## H,r1 ## E; \
	121	movzbl r2 ## L,r2 ## E; \
	122	xorl OFFSET+8(r8),rc ## E; \
	123	xorl OFFSET+12(r8),rd ## E; \
	124	xorl TAB+1024(,r1,4),r3 ## E;\
	125	xorl TAB(,r2,4),r4 ## E;
	126
	127	#define move_regs(r1,r2,r3,r4) \
	128	movl r3 ## E,r1 ## E; \
	129	movl r4 ## E,r2 ## E;
	130
	131	#define entry(FUNC,BASE,B128,B192) \
	132	prologue(FUNC,BASE,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
	133
	134	#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
	135
	136	#define encrypt_round(TAB,OFFSET) \
	137	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
	138	move_regs(R1,R2,R5,R6)
	139
	140	#define encrypt_final(TAB,OFFSET) \
	141	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
	142
	143	#define decrypt_round(TAB,OFFSET) \
	144	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
	145	move_regs(R1,R2,R5,R6)
	146
	147	#define decrypt_final(TAB,OFFSET) \
	148	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
	149
	150	/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */
	151
	152	entry(aes_encrypt,0,enc128,enc192)
	153	encrypt_round(aes_ft_tab,-96)
	154	encrypt_round(aes_ft_tab,-80)
	155	enc192: encrypt_round(aes_ft_tab,-64)
	156	encrypt_round(aes_ft_tab,-48)
	157	enc128: encrypt_round(aes_ft_tab,-32)
	158	encrypt_round(aes_ft_tab,-16)
	159	encrypt_round(aes_ft_tab, 0)
	160	encrypt_round(aes_ft_tab, 16)
	161	encrypt_round(aes_ft_tab, 32)
	162	encrypt_round(aes_ft_tab, 48)
	163	encrypt_round(aes_ft_tab, 64)
	164	encrypt_round(aes_ft_tab, 80)
	165	encrypt_round(aes_ft_tab, 96)
	166	encrypt_final(aes_fl_tab,112)
	167	return
	168
	169	/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */
	170
	171	entry(aes_decrypt,240,dec128,dec192)
	172	decrypt_round(aes_it_tab,-96)
	173	decrypt_round(aes_it_tab,-80)
	174	dec192: decrypt_round(aes_it_tab,-64)
	175	decrypt_round(aes_it_tab,-48)
	176	dec128: decrypt_round(aes_it_tab,-32)
	177	decrypt_round(aes_it_tab,-16)
	178	decrypt_round(aes_it_tab, 0)
	179	decrypt_round(aes_it_tab, 16)
	180	decrypt_round(aes_it_tab, 32)
	181	decrypt_round(aes_it_tab, 48)
	182	decrypt_round(aes_it_tab, 64)
	183	decrypt_round(aes_it_tab, 80)
	184	decrypt_round(aes_it_tab, 96)
	185	decrypt_final(aes_il_tab,112)
	186	return