diff options
Diffstat (limited to 'arch/x86/crypto/ghash-clmulni-intel_asm.S')
-rw-r--r-- | arch/x86/crypto/ghash-clmulni-intel_asm.S | 157 |
1 files changed, 157 insertions, 0 deletions
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S new file mode 100644 index 000000000000..b9e787a511da --- /dev/null +++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S | |||
@@ -0,0 +1,157 @@ | |||
1 | /* | ||
2 | * Accelerated GHASH implementation with Intel PCLMULQDQ-NI | ||
3 | * instructions. This file contains the accelerated part of the ghash | ||
4 | * implementation. More information about PCLMULQDQ can be found at: | ||
5 | * | ||
6 | * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ | ||
7 | * | ||
8 | * Copyright (c) 2009 Intel Corp. | ||
9 | * Author: Huang Ying <ying.huang@intel.com> | ||
10 | * Vinodh Gopal | ||
11 | * Erdinc Ozturk | ||
12 | * Deniz Karakoyunlu | ||
13 | * | ||
14 | * This program is free software; you can redistribute it and/or modify it | ||
15 | * under the terms of the GNU General Public License version 2 as published | ||
16 | * by the Free Software Foundation. | ||
17 | */ | ||
18 | |||
19 | #include <linux/linkage.h> | ||
20 | |||
/*
 * 16-byte constants used by the GHASH routines in this file.
 *
 * They are read with aligned SSE memory operands (movaps, pcmpeqd, pand),
 * so the 16-byte alignment below is required. The data is read-only and is
 * therefore placed in .rodata instead of the executable default section.
 */
.section .rodata
.align 16
/* pshufb control mask: reverses the order of the 16 bytes in a register */
.Lbswap_mask:
	.octa 0x000102030405060708090a0b0c0d0e0f
/* value conditionally XORed into the shifted key by clmul_ghash_setkey */
.Lpoly:
	.octa 0xc2000000000000000000000000000001
/* dword-wise comparison pattern used by clmul_ghash_setkey */
.Ltwo_one:
	.octa 0x00000001000000000000000000000001
28 | |||
/*
 * Symbolic names for the XMM registers used by the routines in this file.
 */

/* operand1 / result of __clmul_gf128mul_ble; value loaded from and stored to dst */
#define DATA	%xmm0
/* operand2 of __clmul_gf128mul_ble, loaded from the shash argument */
#define SHASH	%xmm1
/* scratch registers changed by __clmul_gf128mul_ble */
#define T1	%xmm2
#define T2	%xmm3
#define T3	%xmm4
/* holds .Lbswap_mask for pshufb */
#define BSWAP	%xmm5
/* current 16-byte input block in clmul_ghash_update */
#define IN1	%xmm6
36 | |||
.text

/*
 * __clmul_gf128mul_ble - multiply DATA by SHASH and reduce the product
 *
 * Internal, register-based ABI (called from the routines in this file):
 *   input:
 *	DATA:	operand1
 *	SHASH:	operand2, hash_key << 1 mod poly
 *   output:
 *	DATA:	operand1 * operand2 mod poly
 *   changed:
 *	T1, T2, T3
 *
 * Below, a1:a0 and b1:b0 denote the high:low 64-bit halves of DATA and
 * SHASH on entry, and "+" is XOR. PCLMULQDQ is emitted as raw bytes; the
 * comment above each .byte line gives the instruction it encodes.
 */
__clmul_gf128mul_ble:
	movaps DATA, T1			/* T1 = a1:a0, kept for a1 * b1 */
	pshufd $0x4e, DATA, T2		/* T2 = a0:a1 (64-bit halves swapped) */
	pxor DATA, T2			/* low half of T2 = a1 + a0 */
	pshufd $0x4e, SHASH, T3		/* T3 = b0:b1 (64-bit halves swapped) */
	pxor SHASH, T3			/* low half of T3 = b1 + b0 */

	/* pclmulqdq $0x00, SHASH, DATA:  DATA = a0 * b0 */
	.byte 0x66, 0x0f, 0x3a, 0x44, 0xc1, 0x00
	/* pclmulqdq $0x11, SHASH, T1:  T1 = a1 * b1 */
	.byte 0x66, 0x0f, 0x3a, 0x44, 0xd1, 0x11
	/* pclmulqdq $0x00, T3, T2:  T2 = (a1 + a0) * (b1 + b0) */
	.byte 0x66, 0x0f, 0x3a, 0x44, 0xdc, 0x00
	pxor DATA, T2
	pxor T1, T2			/* T2 = a0 * b1 + a1 * b0 */

	/* fold the middle term into the 256-bit product <T1:DATA> */
	movaps T2, T3
	pslldq $8, T3			/* low half of T2 -> high half of T3 */
	psrldq $8, T2			/* high half of T2 -> low half of T2 */
	pxor T3, DATA
	pxor T2, T1			/* <T1:DATA> = carry-less product */

	/*
	 * First phase of the reduction.
	 * Per 64-bit lane: T3 = (DATA << 63) ^ (DATA << 62) ^ (DATA << 57)
	 */
	movaps DATA, T3
	psllq $1, T3
	pxor DATA, T3
	psllq $5, T3
	pxor DATA, T3
	psllq $57, T3
	movaps T3, T2
	pslldq $8, T2			/* low half of T3 -> high half of DATA */
	psrldq $8, T3			/* high half of T3 -> low half of T1 */
	pxor T2, DATA
	pxor T3, T1

	/*
	 * Second phase of the reduction.
	 * Per 64-bit lane: T2 = (DATA >> 7) ^ (DATA >> 2) ^ (DATA >> 1)
	 */
	movaps DATA, T2
	psrlq $5, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor DATA, T2
	psrlq $1, T2
	pxor T2, T1
	pxor T1, DATA			/* DATA = reduced 128-bit result */
	ret
97 | |||
/*
 * void clmul_ghash_mul(char *dst, const be128 *shash)
 *
 * Multiply the 16-byte value at dst by *shash via __clmul_gf128mul_ble and
 * store the result back to dst.
 *
 * In:	%rdi = dst   (16 bytes, read and written; accessed unaligned)
 *	%rsi = shash (16 bytes, read; accessed unaligned)
 * Uses: DATA, SHASH, T1, T2, T3, BSWAP (%xmm0-%xmm5)
 */
ENTRY(clmul_ghash_mul)
	movups (%rdi), DATA
	movups (%rsi), SHASH
	/* RIP-relative load: no absolute address relocation needed */
	movaps .Lbswap_mask(%rip), BSWAP
	pshufb BSWAP, DATA		/* byte-reverse dst before multiplying */
	call __clmul_gf128mul_ble
	pshufb BSWAP, DATA		/* restore the original byte order */
	movups DATA, (%rdi)
	ret
ENDPROC(clmul_ghash_mul)
108 | |||
/*
 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
 *			   const be128 *shash);
 *
 * For every complete 16-byte block of src: XOR the byte-reversed block into
 * the running value taken from dst and multiply by *shash via
 * __clmul_gf128mul_ble. The final value is written back to dst.
 *
 * In:	%rdi = dst    (16 bytes, read and written; accessed unaligned)
 *	%rsi = src    (read 16 bytes at a time; accessed unaligned)
 *	%edx = srclen (bytes; a trailing partial block is not processed)
 *	%rcx = shash  (16 bytes, read; accessed unaligned)
 * Uses: DATA, SHASH, T1, T2, T3, BSWAP, IN1 (%xmm0-%xmm6), %rsi, %edx
 *
 * If srclen < 16 the function returns without touching dst.
 *
 * srclen is a 32-bit unsigned argument, so it is handled in %edx with
 * unsigned branches: the upper half of %rdx is not defined by the calling
 * convention and must not take part in the length checks.
 */
ENTRY(clmul_ghash_update)
	cmp $16, %edx
	jb .Lupdate_just_ret		/* less than one full block: nothing to do */
	/* RIP-relative load: no absolute address relocation needed */
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rdi), DATA
	movups (%rcx), SHASH
	pshufb BSWAP, DATA		/* byte-reverse the running value */
.align 4
.Lupdate_loop:
	/* invariant: %edx >= 16 bytes remain at %rsi */
	movups (%rsi), IN1
	pshufb BSWAP, IN1
	pxor IN1, DATA
	call __clmul_gf128mul_ble
	sub $16, %edx
	add $16, %rsi
	cmp $16, %edx
	jae .Lupdate_loop		/* unsigned: another full block remains */
	pshufb BSWAP, DATA		/* restore the original byte order */
	movups DATA, (%rdi)
.Lupdate_just_ret:
	ret
ENDPROC(clmul_ghash_update)
134 | |||
/*
 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
 *
 * Calculate hash_key << 1 mod poly
 *
 * In:	%rdi = shash (16 bytes, written; accessed unaligned)
 *	%rsi = key   (16 bytes, read; accessed unaligned)
 * Uses: %xmm0, %xmm1, %xmm2, BSWAP (%xmm5)
 */
ENTRY(clmul_ghash_setkey)
	/* RIP-relative constant accesses: no absolute address relocations */
	movaps .Lbswap_mask(%rip), BSWAP
	movups (%rsi), %xmm0
	pshufb BSWAP, %xmm0		/* byte-reverse the key */

	/* 128-bit left shift by one, built from two 64-bit lane shifts */
	movaps %xmm0, %xmm1
	psllq $1, %xmm0			/* each 64-bit lane << 1 */
	psrlq $63, %xmm1		/* bit 0 of each lane = bit shifted out */
	movaps %xmm1, %xmm2
	pslldq $8, %xmm1		/* carry of the low lane -> high lane */
	psrldq $8, %xmm2		/* carry of the high lane (bit 127) -> bit 0 */
	por %xmm1, %xmm0		/* xmm0 = key << 1, bit 127 dropped */

	/*
	 * reduction: xmm2 holds the dropped bit in dword 0. Replicate it to
	 * dwords 0 and 3, compare against .Ltwo_one to get an all-ones mask
	 * when the bit was set, and use the mask to select .Lpoly or zero.
	 */
	pshufd $0b00100100, %xmm2, %xmm1
	pcmpeqd .Ltwo_one(%rip), %xmm1
	pand .Lpoly(%rip), %xmm1
	pxor %xmm1, %xmm0
	movups %xmm0, (%rdi)
	ret
ENDPROC(clmul_ghash_setkey)