aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/crypto/aes-x86_64-asm.S
diff options
context:
space:
mode:
authorAndreas Steinmetz <ast@domdv.de>2005-07-06 16:55:00 -0400
committerDavid S. Miller <davem@davemloft.net>2005-07-06 16:55:00 -0400
commita2a892a236d03a6e985471a7e57d1c863de144c8 (patch)
tree33b52c87bdecf0f24936b952a565a445ce03c616 /arch/x86_64/crypto/aes-x86_64-asm.S
parenta61cc44812ff94793987bf43b70a3d9bc64a6820 (diff)
[CRYPTO] Add x86_64 asm AES
Implementation: =============== The encrypt/decrypt code is based on an x86 implementation I did a while ago which I never published. This unpublished implementation does include an assembler based key schedule and precomputed tables. For simplicity and best acceptance, however, I took Gladman's in-kernel code for table generation and key schedule for the kernel port of my assembler code and modified this code to produce the key schedule as required by my assembler implementation. File locations and Kconfig are kept similar to the i586 AES assembler implementation. It may seem a little bit strange to use 32 bit I/O and registers in the assembler implementation but this gives the best code size. My implementation takes one instruction more per round compared to Gladman's x86 assembler but it doesn't require any stack for local variables or saved registers and it is less serialized than Gladman's code. Note that all comparisons to Gladman's code were done after my code was implemented. I did only use FIPS PUB 197 for the implementation so my implementation is independent work. If anybody has a better assembler solution for x86_64 I'll be pleased to have my code replaced with the better solution. Testing: ======== The implementation passes the in-kernel crypto testing module and I'm running it without any problems on my laptop where it is mainly used for dm-crypt. Microbenchmark: =============== The microbenchmark was done in userspace with similar compile flags as used during kernel compile. Encrypt/decrypt is about 35% faster than the generic C implementation. As the generic C as well as my assembler implementation are both table based, I don't really expect that there is much room for further improvements though I'll be glad to be corrected here. The key schedule is about 5% slower than the generic C implementation. This is due to the fact that some more work has to be done in the key schedule routine to fit the schedule to the assembler implementation.
Code Size: ========== Encrypt and decrypt are together about 2.1 Kbytes smaller than the generic C implementation which is important with regard to L1 cache usage. The key schedule routine is about 100 bytes larger than the generic C implementation. Data Size: ========== There's no difference in data size requirements between the assembler implementation and the generic C implementation. License: ======== Gladman's code is dual BSD/GPL whereas my assembler code is GPLv2 only (I'm not going to change the license for my code). So I had to change the module license for the x86_64 aes module from 'Dual BSD/GPL' to 'GPL' to reflect the most restrictive license within the module. Signed-off-by: Andreas Steinmetz <ast@domdv.de> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'arch/x86_64/crypto/aes-x86_64-asm.S')
-rw-r--r--arch/x86_64/crypto/aes-x86_64-asm.S186
1 files changed, 186 insertions, 0 deletions
diff --git a/arch/x86_64/crypto/aes-x86_64-asm.S b/arch/x86_64/crypto/aes-x86_64-asm.S
new file mode 100644
index 000000000000..483cbb23ab8d
--- /dev/null
+++ b/arch/x86_64/crypto/aes-x86_64-asm.S
@@ -0,0 +1,186 @@
1/* AES (Rijndael) implementation (FIPS PUB 197) for x86_64
2 *
3 * Copyright (C) 2005 Andreas Steinmetz, <ast@domdv.de>
4 *
5 * License:
6 * This code can be distributed under the terms of the GNU General Public
7 * License (GPL) Version 2 provided that the above header down to and
8 * including this sentence is retained in full.
9 */
10
11.extern aes_ft_tab
12.extern aes_it_tab
13.extern aes_fl_tab
14.extern aes_il_tab
15
16.text
17
/*
 * Register aliases used by the macros in this file.
 *
 * The macros build sub-register names by token-pasting a suffix onto the
 * alias (e.g. "r5 ## E"), so every form that gets pasted must exist here:
 *   (none) = 64-bit register      E = 32-bit form
 *   X      = 16-bit form          H = bits 8-15      L = bits 0-7
 *
 * R1-R4 (%rax, %rbx, %rcx, %rdx) have all five forms.
 * R5-R7 (%rsi, %rdi, %rbp) only have the 64-bit and E forms.
 * R8-R11 (%r8-%r11) are only defined in their 64-bit form.
 */
18#define R1 %rax
19#define R1E %eax
20#define R1X %ax
21#define R1H %ah
22#define R1L %al
23#define R2 %rbx
24#define R2E %ebx
25#define R2X %bx
26#define R2H %bh
27#define R2L %bl
28#define R3 %rcx
29#define R3E %ecx
30#define R3X %cx
31#define R3H %ch
32#define R3L %cl
33#define R4 %rdx
34#define R4E %edx
35#define R4X %dx
36#define R4H %dh
37#define R4L %dl
38#define R5 %rsi
39#define R5E %esi
40#define R6 %rdi
41#define R6E %edi
42#define R7 %rbp
43#define R7E %ebp
44#define R8 %r8
45#define R9 %r9
46#define R10 %r10
47#define R11 %r11
48
/*
 * prologue - define the global function label FUNC, load the input block
 * and apply the first 16 bytes of key material.
 *
 * Parameters (r1..r11 are register alias names, see the R* defines):
 *   FUNC        symbol to define (.global, @function, 8-byte aligned)
 *   BASE        byte offset added to the context pointer to select the
 *               key material (entry() is used with 0 and with 240)
 *   B128, B192  labels branched to when the word at offset 0 of the
 *               context is below 24 / equal to 24
 *   r1 -> r2    64-bit copy made first; the E form of r1 then receives
 *               input word 1
 *   r3 -> r4    second 64-bit copy
 *   r5, r6      their E forms receive input words 0 and 2
 *   r7          in: pointer to the 16 input bytes; its E form is
 *               overwritten with input word 3 by the last load
 *   r8          in: context pointer
 *   r9          out: round key pointer, r8 + BASE + 52, plus 32 or 64
 *               depending on the branch taken
 *   r10 -> r11  third 64-bit copy; the E form of r10 is then reused for
 *               the 32-bit word read from offset 0 of the context
 *               (presumably the key length in bytes - confirm against the
 *               C side that fills the context)
 *
 * No stack is used: the three 64-bit copies take the place of pushes.
 */
49#define prologue(FUNC,BASE,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
50 .global FUNC; \
51 .type FUNC,@function; \
52 .align 8; \
53FUNC: movq r1,r2; /* keep r1: its E form is a load target below */ \
54 movq r3,r4; /* keep r3 */ \
55 leaq BASE+52(r8),r9; /* r9 = context + BASE + 52 */ \
56 movq r10,r11; /* keep r10: its E form is reused below */ \
57 movl (r7),r5 ## E; /* four 32-bit input words ... */ \
58 movl 4(r7),r1 ## E; \
59 movl 8(r7),r6 ## E; \
60 movl 12(r7),r7 ## E; /* ... the last one replaces the input pointer */ \
61 movl (r8),r10 ## E; /* 32-bit word at offset 0 of the context */ \
62 xorl -48(r9),r5 ## E; /* xor in the 16 bytes at context + BASE + 4 */ \
63 xorl -44(r9),r1 ## E; \
64 xorl -40(r9),r6 ## E; \
65 xorl -36(r9),r7 ## E; \
66 cmpl $24,r10 ## E; /* these flags drive both branches below */ \
67 jb B128; /* below 24: r9 left unchanged */ \
68 leaq 32(r9),r9; /* lea does not modify the flags */ \
69 je B192; /* equal to 24: r9 advanced by 32 */ \
70 leaq 32(r9),r9; /* otherwise: r9 advanced by 64 in total */
71
/*
 * epilogue - copy two 64-bit registers, store four 32-bit result words
 * and return to the caller.
 *
 *   sav_a -> reg_a   first 64-bit copy
 *   sav_b -> reg_b   second 64-bit copy
 *   w0 .. w3         registers whose E (32-bit) forms are stored at
 *                    offsets 0, 4, 8 and 12 from outp
 *   outp             register holding the destination address
 *
 * As bound by the "return" macro, the two copies move %r8 back to %rbx
 * and %r9 back to %rbp, reversing the copies made by entry().
 */
72#define epilogue(sav_a,reg_a,sav_b,reg_b,w0,w1,w2,w3,outp) \
73 movq sav_a,reg_a; \
74 movq sav_b,reg_b; \
75 movl w0 ## E,(outp); \
76 movl w1 ## E,4(outp); \
77 movl w2 ## E,8(outp); \
78 movl w3 ## E,12(outp); \
79 ret;
80
/*
 * round - one table-lookup round on a four-word state.
 *
 * TAB is the base of four consecutive lookup tables of 256 32-bit entries
 * each, at TAB, TAB+1024, TAB+2048 and TAB+3072 (T0..T3 below).
 * OFFSET is the displacement from r8 of the 16 bytes of round key used.
 *
 * In:   r1, r2, r3, r4  state words (E forms); bN = bits 8N..8N+7
 *       r8              round key pointer (64-bit register)
 * Out:  r5  = T0[r1.b0] ^ T1[r2.b1] ^ T2[r3.b2] ^ T3[r4.b3]
 *       r6  = T0[r2.b0] ^ T1[r3.b1] ^ T2[r4.b2] ^ T3[r1.b3]
 *       r3  = T0[r3.b0] ^ T1[r4.b1] ^ T2[r1.b2] ^ T3[r2.b3]
 *       r4  = T0[r4.b0] ^ T1[r1.b1] ^ T2[r2.b2] ^ T3[r3.b3]
 *       additionally ra, rb, rc, rd have the key words at OFFSET,
 *       OFFSET+4, OFFSET+8 and OFFSET+12 xored in.
 * Clobbers: r1, r2 (left holding single byte values), r7 (scratch), flags.
 *
 * ra and rb are xored early, when only r5 and r6 hold accumulated output;
 * rc and rd are xored near the end.  ra/rb must therefore name r5/r6 and
 * rc/rd must name r3/r4 (in either order); both users in this file do.
 *
 * r1..r4 need the H, L and X alias forms, r5..r7 only the E form.
 * The "movw + roll $16" pairs pack the low half of one input word next to
 * the high half of another, so the source word can be shifted down and
 * consumed while its low bytes stay available for a later lookup.
 * The instruction interleaving is deliberate; do not reorder.
 */
81#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
82 movzbl r2 ## H,r5 ## E; \
83 movzbl r2 ## L,r6 ## E; \
84 movl TAB+1024(,r5,4),r5 ## E;\
85 movw r4 ## X,r2 ## X; \
86 movl TAB(,r6,4),r6 ## E; \
87 roll $16,r2 ## E; \
88 shrl $16,r4 ## E; \
89 movzbl r4 ## H,r7 ## E; \
90 movzbl r4 ## L,r4 ## E; \
91 xorl OFFSET(r8),ra ## E; \
92 xorl OFFSET+4(r8),rb ## E; \
93 xorl TAB+3072(,r7,4),r5 ## E;\
94 xorl TAB+2048(,r4,4),r6 ## E;\
95 movzbl r1 ## L,r7 ## E; \
96 movzbl r1 ## H,r4 ## E; \
97 movl TAB+1024(,r4,4),r4 ## E;\
98 movw r3 ## X,r1 ## X; \
99 roll $16,r1 ## E; \
100 shrl $16,r3 ## E; \
101 xorl TAB(,r7,4),r5 ## E; \
102 movzbl r3 ## H,r7 ## E; \
103 movzbl r3 ## L,r3 ## E; \
104 xorl TAB+3072(,r7,4),r4 ## E;\
105 xorl TAB+2048(,r3,4),r5 ## E;\
106 movzbl r1 ## H,r7 ## E; \
107 movzbl r1 ## L,r3 ## E; \
108 shrl $16,r1 ## E; \
109 xorl TAB+3072(,r7,4),r6 ## E;\
110 movl TAB+2048(,r3,4),r3 ## E;\
111 movzbl r1 ## H,r7 ## E; \
112 movzbl r1 ## L,r1 ## E; \
113 xorl TAB+1024(,r7,4),r6 ## E;\
114 xorl TAB(,r1,4),r3 ## E; \
115 movzbl r2 ## H,r1 ## E; \
116 movzbl r2 ## L,r7 ## E; \
117 shrl $16,r2 ## E; \
118 xorl TAB+3072(,r1,4),r3 ## E;\
119 xorl TAB+2048(,r7,4),r4 ## E;\
120 movzbl r2 ## H,r1 ## E; \
121 movzbl r2 ## L,r2 ## E; \
122 xorl OFFSET+8(r8),rc ## E; \
123 xorl OFFSET+12(r8),rd ## E; \
124 xorl TAB+1024(,r1,4),r3 ## E;\
125 xorl TAB(,r2,4),r4 ## E;
126
/*
 * move_regs - two 32-bit register copies: src_a into dst_a, then src_b
 * into dst_b.  All four arguments are alias names with an E form.
 */
127#define move_regs(dst_a,dst_b,src_a,src_b) \
128 movl src_a ## E,dst_a ## E; \
129 movl src_b ## E,dst_b ## E;
130
/*
 * entry - prologue() with the register assignment shared by both AES
 * functions in this file.
 *
 *   sym      function symbol to define
 *   ks_off   BASE offset handed to prologue()
 *   lbl128   branch target when the word at offset 0 of the context < 24
 *   lbl192   branch target when that word == 24
 *
 * Resulting binding:
 *   %rbx is copied to %r8 and %rbp to %r9 before either is overwritten
 *   R6 (%rdi) = context pointer, R4 (%rdx) = input pointer
 *   R5 (%rsi) is copied to %r11, then %esi receives the context word
 *   R10 = round key pointer
 *   input words 0..3 land in R1E, R2E, R3E, R4E (%eax, %ebx, %ecx, %edx)
 */
131#define entry(sym,ks_off,lbl128,lbl192) \
132 prologue(sym,ks_off,lbl128,lbl192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
133
/*
 * return - epilogue() with the register assignment matching entry():
 * copies %r8 back to %rbx and %r9 back to %rbp, stores R5E, R6E, R3E and
 * R4E at offsets 0, 4, 8 and 12 from the address in %r11, then executes ret.
 */
134#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
135
/*
 * encrypt_round - one encryption round that leaves the state where it
 * found it.  Input words 0..3 are in R1..R4; round() produces words 0/1 in
 * R5/R6 and words 2/3 in R3/R4, and move_regs() copies R5/R6 back into
 * R1/R2.  tbl is the lookup table base, rk the round key displacement
 * from R10.  R7 is scratch.
 */
136#define encrypt_round(tbl,rk) \
137 round(tbl,rk,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
138 move_regs(R1,R2,R5,R6)
139
/*
 * encrypt_final - same round() invocation as encrypt_round but without the
 * copy back: the result words 0..3 stay in R5, R6, R3, R4, which is where
 * the "return" macro stores them from.  tbl is the lookup table base, rk
 * the round key displacement from R10.
 */
140#define encrypt_final(tbl,rk) \
141 round(tbl,rk,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
142
/*
 * decrypt_round - one decryption round that leaves the state where it
 * found it.  round() is invoked with each register pair swapped (R2,R1 /
 * R4,R3 / R6,R5) compared to encrypt_round; this makes output word i take
 * its byte k from input word (i - k) mod 4 instead of (i + k) mod 4.
 * The result words 0..3 still end up in R5, R6, R3, R4, so the round key
 * xor order and the copy back of R5/R6 into R1/R2 are unchanged.
 * tbl is the lookup table base, rk the round key displacement from R10.
 */
143#define decrypt_round(tbl,rk) \
144 round(tbl,rk,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
145 move_regs(R1,R2,R5,R6)
146
/*
 * decrypt_final - same round() invocation as decrypt_round but without the
 * copy back: the result words 0..3 stay in R5, R6, R3, R4, which is where
 * the "return" macro stores them from.  tbl is the lookup table base, rk
 * the round key displacement from R10.
 */
147#define decrypt_final(tbl,rk) \
148 round(tbl,rk,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
149
150/* void aes_encrypt(void *ctx, u8 *out, const u8 *in) */
151
/*
 * Fully unrolled: thirteen rounds using aes_ft_tab followed by one final
 * round using aes_fl_tab, each consuming 16 bytes of round key at the
 * given displacement from the round key pointer in R10.
 *
 * entry() xors the 16 bytes at ctx+4 into the input block and then picks
 * the starting point from the 32-bit word at ctx+0:
 *   word > 24   fall through, all 14 round slots run (R10 = ctx+116)
 *   word == 24  jump to enc192, 12 slots run         (R10 = ctx+84)
 *   word < 24   jump to enc128, 10 slots run         (R10 = ctx+52)
 * Because R10 is advanced by 32 bytes for every two extra rounds, each
 * path starts reading round keys at ctx+20 and ends at displacement 112.
 *
 * With the entry() binding, ctx/out/in are taken from %rdi/%rsi/%rdx.
 * No stack is used; %rbx and %rbp are kept in %r8/%r9 and put back by
 * "return".
 */
152 entry(aes_encrypt,0,enc128,enc192)
153 encrypt_round(aes_ft_tab,-96)
154 encrypt_round(aes_ft_tab,-80)
155enc192: encrypt_round(aes_ft_tab,-64)
156 encrypt_round(aes_ft_tab,-48)
157enc128: encrypt_round(aes_ft_tab,-32)
158 encrypt_round(aes_ft_tab,-16)
159 encrypt_round(aes_ft_tab, 0)
160 encrypt_round(aes_ft_tab, 16)
161 encrypt_round(aes_ft_tab, 32)
162 encrypt_round(aes_ft_tab, 48)
163 encrypt_round(aes_ft_tab, 64)
164 encrypt_round(aes_ft_tab, 80)
165 encrypt_round(aes_ft_tab, 96)
166 encrypt_final(aes_fl_tab,112)
167 return
168
169/* void aes_decrypt(void *ctx, u8 *out, const u8 *in) */
170
/*
 * Fully unrolled: thirteen rounds using aes_it_tab followed by one final
 * round using aes_il_tab, each consuming 16 bytes of round key at the
 * given displacement from the round key pointer in R10.
 *
 * entry() is given BASE = 240, so the key material is read 240 bytes
 * further into the context than in aes_encrypt (presumably a separate
 * decryption key schedule - confirm against the code that fills ctx).
 * It xors the 16 bytes at ctx+244 into the input block and then picks the
 * starting point from the 32-bit word at ctx+0:
 *   word > 24   fall through, all 14 round slots run (R10 = ctx+356)
 *   word == 24  jump to dec192, 12 slots run         (R10 = ctx+324)
 *   word < 24   jump to dec128, 10 slots run         (R10 = ctx+292)
 * Each path starts reading round keys at ctx+260 and ends at
 * displacement 112.
 *
 * With the entry() binding, ctx/out/in are taken from %rdi/%rsi/%rdx.
 * No stack is used; %rbx and %rbp are kept in %r8/%r9 and put back by
 * "return".
 */
171 entry(aes_decrypt,240,dec128,dec192)
172 decrypt_round(aes_it_tab,-96)
173 decrypt_round(aes_it_tab,-80)
174dec192: decrypt_round(aes_it_tab,-64)
175 decrypt_round(aes_it_tab,-48)
176dec128: decrypt_round(aes_it_tab,-32)
177 decrypt_round(aes_it_tab,-16)
178 decrypt_round(aes_it_tab, 0)
179 decrypt_round(aes_it_tab, 16)
180 decrypt_round(aes_it_tab, 32)
181 decrypt_round(aes_it_tab, 48)
182 decrypt_round(aes_it_tab, 64)
183 decrypt_round(aes_it_tab, 80)
184 decrypt_round(aes_it_tab, 96)
185 decrypt_final(aes_il_tab,112)
186 return