diff options
Diffstat (limited to 'arch/x86/crypto/camellia-x86_64-asm_64.S')
-rw-r--r-- | arch/x86/crypto/camellia-x86_64-asm_64.S | 520 |
1 files changed, 520 insertions, 0 deletions
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S new file mode 100644 index 000000000000..0b3374335fdc --- /dev/null +++ b/arch/x86/crypto/camellia-x86_64-asm_64.S | |||
@@ -0,0 +1,520 @@ | |||
1 | /* | ||
2 | * Camellia Cipher Algorithm (x86_64) | ||
3 | * | ||
4 | * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it will be useful, | ||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
14 | * GNU General Public License for more details. | ||
15 | * | ||
16 | * You should have received a copy of the GNU General Public License | ||
17 | * along with this program; if not, write to the Free Software | ||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | ||
19 | * USA | ||
20 | * | ||
21 | */ | ||
22 | |||
23 | .file "camellia-x86_64-asm_64.S" | ||
24 | .text | ||
25 | |||
26 | .extern camellia_sp10011110; | ||
27 | .extern camellia_sp22000222; | ||
28 | .extern camellia_sp03303033; | ||
29 | .extern camellia_sp00444404; | ||
30 | .extern camellia_sp02220222; | ||
31 | .extern camellia_sp30333033; | ||
32 | .extern camellia_sp44044404; | ||
33 | .extern camellia_sp11101110; | ||
34 | |||
/*
 * Short local names for the eight externally defined lookup tables.
 * xor2ror16 indexes each table with a single byte value and reads a
 * 64-bit entry from it.
 */
#define sp10011110		camellia_sp10011110
#define sp22000222		camellia_sp22000222
#define sp03303033		camellia_sp03303033
#define sp00444404		camellia_sp00444404
#define sp02220222		camellia_sp02220222
#define sp30333033		camellia_sp30333033
#define sp44044404		camellia_sp44044404
#define sp11101110		camellia_sp11101110

/* Size in bytes of the subkey table that starts the context structure. */
#define CAMELLIA_TABLE_BYTE_LEN	272

/* struct camellia_ctx: byte offsets of the members accessed through CTX */
#define key_table		0
#define key_length		CAMELLIA_TABLE_BYTE_LEN
49 | |||
/*
 * Register macros.
 *
 * The "0"/"1" suffix on RAB/RCD selects the block: the 1-way macros only
 * ever paste "0", the 2-way macros use both.  The d/bl/bh variants name the
 * 32-bit, low-byte and second-byte views of the same register; the bh views
 * are why the state has to live in %rax/%rbx/%rcx/%rdx.
 */

/* context pointer and current input/output block pointer */
#define CTX	%rdi
#define RIO	%rsi
#define RIOd	%esi

/* cipher state, 64-bit views */
#define RAB0	%rax
#define RCD0	%rcx
#define RAB1	%rbx
#define RCD1	%rdx

/* cipher state, 32-bit views */
#define RAB0d	%eax
#define RCD0d	%ecx
#define RAB1d	%ebx
#define RCD1d	%edx

/* cipher state, byte 0 */
#define RAB0bl	%al
#define RCD0bl	%cl
#define RAB1bl	%bl
#define RCD1bl	%dl

/* cipher state, byte 1 */
#define RAB0bh	%ah
#define RCD0bh	%ch
#define RAB1bh	%bh
#define RCD1bh	%dh

/*
 * Temporaries.  RT0 is the same register as RIO and RT1 is %rbp, so the
 * entry points keep copies of the block pointer (RDST) and of %rbp (RRBP).
 */
#define RT0	%rsi
#define RT1	%rbp
#define RT2	%r8

#define RT0d	%esi
#define RT1d	%ebp
#define RT2d	%r8d

#define RT2bl	%r8b

/* values kept aside while the rounds run */
#define RXOR	%r9
#define RRBP	%r10
#define RDST	%r11

#define RXORd	%r9d
#define RXORbl	%r9b
91 | |||
/*
 * xor2ror16 - two table lookups driven by the low two bytes of ab.
 *
 *   dst ^= T0[byte 0 of ab] ^ T1[byte 1 of ab]	(64-bit table entries)
 *   ab   = ab rotated right by 16 bits
 *
 * tmp1/tmp2 are clobbered with the zero-extended index bytes.  Four
 * invocations on the same ab consume all eight bytes and rotate ab back to
 * its starting value.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bh,			tmp1 ## d; \
	movzbl ab ## bl,			tmp2 ## d; \
	rorq $16,				ab; \
	xorq T0(, tmp2, 8),			dst; \
	xorq T1(, tmp1, 8),			dst;
98 | |||
99 | /********************************************************************** | ||
100 | 1-way camellia | ||
101 | **********************************************************************/ | ||
/*
 * roundsm - one round on a single block.
 *
 * Loads 64-bit subkey number `subkey` into RT2, then walks the eight bytes
 * of ab ## 0 through the lookup tables.  The four xor2ror16 steps alternate
 * their destination between cd ## 0 and RT2, and RT2 is folded into
 * cd ## 0 at the end, so the net effect is
 *
 *   cd0 ^= subkey ^ (eight table entries selected by the bytes of ab0)
 *
 * ab ## 0 is left unchanged (rotated by 4 * 16 bits).  Clobbers RT0, RT1, RT2.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + (subkey) * 8)(CTX),		RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2,					cd ## 0;
111 | |||
/*
 * fls - key-dependent mixing of both state halves of a single block
 * (presumably the FL / FL^-1 layer of the Camellia specification - confirm
 * against the spec).
 *
 * With lo/hi denoting the low/high 32 bits of a 64-bit value and kl/kr the
 * 64-bit subkeys with those indices, the four steps are, in order:
 *
 *   l.hi ^= rol32(kl.lo & l.lo, 1)
 *   r.lo ^= kr.hi | r.hi
 *   l.lo ^= kl.hi | l.hi
 *   r.hi ^= rol32(kr.lo & r.lo, 1)
 *
 * Steps three and four read the values written by steps one and two, so the
 * order must be kept.  Clobbers RT0, RT1, RT2.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + (kl) * 8)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + (kr) * 8)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movq (key_table + (kl) * 8)(CTX),		RT2; \
	orq l ## 0,					RT2; \
	shrq $32,					RT2; \
	xorq RT2,					l ## 0; \
	movl (key_table + (kr) * 8)(CTX),		RT0d; \
	andl r ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					r ## 0;
132 | |||
/*
 * enc_rounds - six consecutive rounds with subkeys i+2 ... i+7 in ascending
 * order; the half used as round input alternates between RAB and RCD.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* enc_fls - fls step between round groups: kl = subkey i, kr = subkey i+1. */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);
143 | |||
/*
 * enc_inpack - load one 16-byte block from (RIO) for encryption.
 *
 * Each 8-byte half is byte-reversed and then rotated by 32 bits, which
 * byte-swaps its two 32-bit words individually while keeping them in place.
 * Subkey 0 is XORed into the first half (RAB0) only.
 */
#define enc_inpack() \
	movq (RIO),					RAB0; \
	bswapq						RAB0; \
	rolq $32,					RAB0; \
	movq 8(RIO),					RCD0; \
	bswapq						RCD0; \
	rorq $32,					RCD0; \
	xorq key_table(CTX),				RAB0;
152 | |||
/*
 * enc_outunpack - write one encrypted 16-byte block to (RIO).
 *
 * op  is "mov" (store) or "xor" (XOR into what is already at the
 *     destination).
 * max is a register holding the index of the last subkey, which is XORed
 *     into RCD0 first.
 *
 * The halves go out in swapped order relative to enc_inpack: RCD0 lands at
 * offset 0 and RAB0 at offset 8, each converted back with the same
 * rotate-by-32 plus byte-reverse pair.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8),			RCD0; \
	rorq $32,					RCD0; \
	bswapq						RCD0; \
	op ## q RCD0,					(RIO); \
	rolq $32,					RAB0; \
	bswapq						RAB0; \
	op ## q RAB0,					8(RIO);
161 | |||
/*
 * dec_rounds - six consecutive rounds with subkeys i+7 ... i+2, i.e. the
 * same subkeys as enc_rounds(i) applied in descending order.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/*
 * dec_fls - fls step for decryption: the subkey roles are exchanged
 * compared with enc_fls (kl = subkey i+1, kr = subkey i).
 */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);
172 | |||
/*
 * dec_inpack - load one 16-byte block from (RIO) for decryption.
 *
 * Same word-wise byte swap as enc_inpack, but the subkey XORed into RAB0 is
 * the last one, selected by the index held in register `max`.
 */
#define dec_inpack(max) \
	movq (RIO),					RAB0; \
	bswapq						RAB0; \
	rolq $32,					RAB0; \
	movq 8(RIO),					RCD0; \
	bswapq						RCD0; \
	rorq $32,					RCD0; \
	xorq key_table(CTX, max, 8),			RAB0;
181 | |||
/*
 * dec_outunpack - write one decrypted 16-byte block to (RIO).
 *
 * Subkey 0 is XORed into RCD0, then the halves are stored in swapped order
 * (RCD0 at offset 0, RAB0 at offset 8) after undoing the load-time byte
 * swap.  Always a plain store; there is no XOR-into-destination variant.
 */
#define dec_outunpack() \
	xorq key_table(CTX),				RCD0; \
	rorq $32,					RCD0; \
	bswapq						RCD0; \
	movq RCD0,					(RIO); \
	rolq $32,					RAB0; \
	bswapq						RAB0; \
	movq RAB0,					8(RIO);
190 | |||
/*
 * __camellia_enc_blk - encrypt one 16-byte block.
 *
 * Reads 16 bytes from src and either stores the result to dst (xor == 0)
 * or XORs it into the 16 bytes already at dst (xor != 0; only the low byte
 * of the argument is tested).  A key_length of 16 runs three groups of six
 * rounds with last subkey index 24; any other key_length runs a fourth
 * group with last subkey index 32.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  %rbp is used as a
 * temporary (RT1) and restored from RRBP before returning.
 */
.global __camellia_enc_blk;
.type   __camellia_enc_blk,@function;

__camellia_enc_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller's value */

	movq %rcx, RXOR;	/* %rcx becomes RCD0 */
	movq %rsi, RDST;	/* %rsi becomes RT0 once the rounds start */
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/* key_length is compared as 32 bits, as in camellia_dec_blk */
	cmpl $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* mov leaves the flags from testb intact */

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RRBP, %rbp;
	ret;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RRBP, %rbp;
	ret;

.size __camellia_enc_blk,.-__camellia_enc_blk;
239 | |||
/*
 * camellia_dec_blk - decrypt one 16-byte block.
 *
 * Reads 16 bytes from src and stores the result to dst.  The last subkey
 * index ("max") is 24 when key_length is 16 and 32 otherwise; with max == 32
 * one extra group of rounds runs before the three common groups.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  %rbp is used as a
 * temporary (RT1) and restored from RRBP before returning.
 */
.global camellia_dec_blk;
.type   camellia_dec_blk,@function;

camellia_dec_blk:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;	/* RXOR is only a scratch register here */
	cmovel RXORd, RT2d; /* max */

	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller's value */
	movq %rsi, RDST;	/* %rsi becomes RT0 once the rounds start */
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is overwritten by the rounds, so test max before running them */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RRBP, %rbp;
	ret;

.size camellia_dec_blk,.-camellia_dec_blk;
279 | |||
280 | /********************************************************************** | ||
281 | 2-way camellia | ||
282 | **********************************************************************/ | ||
/*
 * roundsm2 - one round on two blocks at once.
 *
 * Block 0 is handled as in roundsm: its lookups alternate between cd ## 0
 * and RT2 (which starts out as the subkey), and RT2 is folded into cd ## 0
 * afterwards.  For block 1 the subkey is XORed into cd ## 1 up front,
 * before RT2 picks up any block-0 data, and all of its lookups then go
 * straight into cd ## 1.  The fold of RT2 into cd ## 0 sits after the first
 * block-1 lookup; it touches neither ab ## 1 nor cd ## 1.
 *
 * ab ## 0 and ab ## 1 are left unchanged.  Clobbers RT0, RT1, RT2.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + (subkey) * 8)(CTX),		RT2; \
	xorq RT2,					cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2,					cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
297 | |||
/*
 * fls2 - the fls mixing step applied to two blocks.
 *
 * Per block n (0 and 1), with lo/hi the low/high 32 bits of a 64-bit value:
 *
 *   l_n.hi ^= rol32(kl.lo & l_n.lo, 1)
 *   r_n.lo ^= kr.hi | r_n.hi
 *   l_n.lo ^= kl.hi | l_n.hi
 *   r_n.hi ^= rol32(kr.lo & r_n.lo, 1)
 *
 * The first two steps are done for block 0, then for block 1; the last two
 * steps follow in the same block order.  The temporaries rotate through
 * RT0/RT1/RT2 from group to group; all three are clobbered.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + (kl) * 8)(CTX),		RT0d; \
	andl l ## 0d,					RT0d; \
	roll $1,					RT0d; \
	shlq $32,					RT0; \
	xorq RT0,					l ## 0; \
	movq (key_table + (kr) * 8)(CTX),		RT1; \
	orq r ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					r ## 0; \
	\
	movl (key_table + (kl) * 8)(CTX),		RT2d; \
	andl l ## 1d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					l ## 1; \
	movq (key_table + (kr) * 8)(CTX),		RT0; \
	orq r ## 1,					RT0; \
	shrq $32,					RT0; \
	xorq RT0,					r ## 1; \
	\
	movq (key_table + (kl) * 8)(CTX),		RT1; \
	orq l ## 0,					RT1; \
	shrq $32,					RT1; \
	xorq RT1,					l ## 0; \
	movl (key_table + (kr) * 8)(CTX),		RT2d; \
	andl r ## 0d,					RT2d; \
	roll $1,					RT2d; \
	shlq $32,					RT2; \
	xorq RT2,					r ## 0; \
	\
	movq (key_table + (kl) * 8)(CTX),		RT0; \
	orq l ## 1,					RT0; \
	shrq $32,					RT0; \
	xorq RT0,					l ## 1; \
	movl (key_table + (kr) * 8)(CTX),		RT1d; \
	andl r ## 1d,					RT1d; \
	roll $1,					RT1d; \
	shlq $32,					RT1; \
	xorq RT1,					r ## 1;
338 | |||
/*
 * enc_rounds2 - six consecutive two-block rounds with subkeys i+2 ... i+7
 * in ascending order; the round-input half alternates between RAB and RCD.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* enc_fls2 - two-block fls step: kl = subkey i, kr = subkey i+1. */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);
349 | |||
/*
 * enc_inpack2 - load two consecutive 16-byte blocks (32 bytes) from (RIO)
 * for encryption.
 *
 * Block 0 goes to RAB0/RCD0 and block 1 to RAB1/RCD1.  Every 8-byte half is
 * byte-reversed and rotated by 32 bits, i.e. its two 32-bit words are
 * byte-swapped in place.  Subkey 0 is XORed into RAB0 and RAB1.
 */
#define enc_inpack2() \
	movq (RIO),					RAB0; \
	bswapq						RAB0; \
	rorq $32,					RAB0; \
	movq 8(RIO),					RCD0; \
	bswapq						RCD0; \
	rolq $32,					RCD0; \
	xorq key_table(CTX),				RAB0; \
	\
	movq 16(RIO),					RAB1; \
	bswapq						RAB1; \
	rorq $32,					RAB1; \
	movq 24(RIO),					RCD1; \
	bswapq						RCD1; \
	rolq $32,					RCD1; \
	xorq key_table(CTX),				RAB1;
366 | |||
/*
 * enc_outunpack2 - write two encrypted 16-byte blocks (32 bytes) to (RIO).
 *
 * op  is "mov" (store) or "xor" (XOR into what is already at the
 *     destination).
 * max is a register holding the index of the last subkey, which is XORed
 *     into RCD0 and RCD1.
 *
 * Within each block the halves are written in swapped order: RCDn first,
 * RABn second.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8),			RCD0; \
	rolq $32,					RCD0; \
	bswapq						RCD0; \
	op ## q RCD0,					(RIO); \
	rorq $32,					RAB0; \
	bswapq						RAB0; \
	op ## q RAB0,					8(RIO); \
	\
	xorq key_table(CTX, max, 8),			RCD1; \
	rolq $32,					RCD1; \
	bswapq						RCD1; \
	op ## q RCD1,					16(RIO); \
	rorq $32,					RAB1; \
	bswapq						RAB1; \
	op ## q RAB1,					24(RIO);
383 | |||
/*
 * dec_rounds2 - six consecutive two-block rounds with subkeys i+7 ... i+2,
 * i.e. the subkeys of enc_rounds2(i) applied in descending order.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/*
 * dec_fls2 - two-block fls step for decryption: subkey roles exchanged
 * compared with enc_fls2 (kl = subkey i+1, kr = subkey i).
 */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);
394 | |||
/*
 * dec_inpack2 - load two consecutive 16-byte blocks (32 bytes) from (RIO)
 * for decryption.
 *
 * Same word-wise byte swap as enc_inpack2; the subkey XORed into RAB0 and
 * RAB1 is the last one, selected by the index held in register `max`.
 */
#define dec_inpack2(max) \
	movq (RIO),					RAB0; \
	bswapq						RAB0; \
	rorq $32,					RAB0; \
	movq 8(RIO),					RCD0; \
	bswapq						RCD0; \
	rolq $32,					RCD0; \
	xorq key_table(CTX, max, 8),			RAB0; \
	\
	movq 16(RIO),					RAB1; \
	bswapq						RAB1; \
	rorq $32,					RAB1; \
	movq 24(RIO),					RCD1; \
	bswapq						RCD1; \
	rolq $32,					RCD1; \
	xorq key_table(CTX, max, 8),			RAB1;
411 | |||
/*
 * dec_outunpack2 - write two decrypted 16-byte blocks (32 bytes) to (RIO).
 *
 * Subkey 0 is XORed into RCD0 and RCD1; within each block the halves are
 * stored in swapped order (RCDn first, RABn second) after undoing the
 * load-time byte swap.  Always a plain store.
 */
#define dec_outunpack2() \
	xorq key_table(CTX),				RCD0; \
	rolq $32,					RCD0; \
	bswapq						RCD0; \
	movq RCD0,					(RIO); \
	rorq $32,					RAB0; \
	bswapq						RAB0; \
	movq RAB0,					8(RIO); \
	\
	xorq key_table(CTX),				RCD1; \
	rolq $32,					RCD1; \
	bswapq						RCD1; \
	movq RCD1,					16(RIO); \
	rorq $32,					RAB1; \
	bswapq						RAB1; \
	movq RAB1,					24(RIO);
428 | |||
/*
 * __camellia_enc_blk_2way - encrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and either stores the result to dst (xor == 0)
 * or XORs it into the 32 bytes already at dst (xor != 0; only the low byte
 * of the argument is tested).  A key_length of 16 runs three groups of six
 * rounds with last subkey index 24; any other key_length runs a fourth
 * group with last subkey index 32.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  %rbx (RAB1) is saved
 * on the stack and %rbp (RT1) in RRBP; both are restored before returning.
 */
.global __camellia_enc_blk_2way;
.type   __camellia_enc_blk_2way,@function;

__camellia_enc_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;		/* RAB1 is %rbx, which is callee-saved */

	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller's value */
	movq %rcx, RXOR;	/* %rcx becomes RCD0 */
	movq %rsi, RDST;	/* %rsi becomes RT0 once the rounds start */
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/* key_length is compared as 32 bits, as in camellia_dec_blk_2way */
	cmpl $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	testb RXORbl, RXORbl;
	movq RDST, RIO;		/* mov leaves the flags from testb intact */
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RRBP, %rbp;
	popq %rbx;
	ret;

.size __camellia_enc_blk_2way,.-__camellia_enc_blk_2way;
479 | |||
/*
 * camellia_dec_blk_2way - decrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes from src and stores the result to dst.  The last subkey
 * index ("max") is 24 when key_length is 16 and 32 otherwise; with max == 32
 * one extra group of rounds runs before the three common groups.
 *
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r11, flags.  %rbx (RAB1) is kept
 * in RXOR and %rbp (RT1) in RRBP; both are restored before returning.
 */
.global camellia_dec_blk_2way;
.type   camellia_dec_blk_2way,@function;

camellia_dec_blk_2way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;	/* RXOR is only a scratch register here ... */
	cmovel RXORd, RT2d; /* max */

	movq %rbx, RXOR;	/* ... and from here on holds the caller's %rbx */
	movq %rbp, RRBP;	/* RT1 is %rbp: keep the caller's value */
	movq %rsi, RDST;	/* %rsi becomes RT0 once the rounds start */
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is overwritten by the rounds, so test max before running them */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RRBP, %rbp;
	movq RXOR, %rbx;
	ret;

.size camellia_dec_blk_2way,.-camellia_dec_blk_2way;