aboutsummaryrefslogtreecommitdiffstats
path: root/include/asm-x86/xor_64.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/asm-x86/xor_64.h')
-rw-r--r--include/asm-x86/xor_64.h294
1 files changed, 149 insertions, 145 deletions
diff --git a/include/asm-x86/xor_64.h b/include/asm-x86/xor_64.h
index 1eee7fcb2420..24957e39ac8a 100644
--- a/include/asm-x86/xor_64.h
+++ b/include/asm-x86/xor_64.h
@@ -24,20 +24,23 @@
24 */ 24 */
25 25
26/* 26/*
27 * x86-64 changes / gcc fixes from Andi Kleen. 27 * x86-64 changes / gcc fixes from Andi Kleen.
28 * Copyright 2002 Andi Kleen, SuSE Labs. 28 * Copyright 2002 Andi Kleen, SuSE Labs.
29 * 29 *
30 * This hasn't been optimized for the hammer yet, but there are likely 30 * This hasn't been optimized for the hammer yet, but there are likely
31 * no advantages to be gotten from x86-64 here anyways. 31 * no advantages to be gotten from x86-64 here anyways.
32 */ 32 */
33 33
34typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t; 34typedef struct {
35 unsigned long a, b;
36} __attribute__((aligned(16))) xmm_store_t;
35 37
36/* Doesn't use gcc to save the XMM registers, because there is no easy way to 38/* Doesn't use gcc to save the XMM registers, because there is no easy way to
37 tell it to do a clts before the register saving. */ 39 tell it to do a clts before the register saving. */
38#define XMMS_SAVE do { \ 40#define XMMS_SAVE \
41do { \
39 preempt_disable(); \ 42 preempt_disable(); \
40 asm volatile ( \ 43 asm volatile( \
41 "movq %%cr0,%0 ;\n\t" \ 44 "movq %%cr0,%0 ;\n\t" \
42 "clts ;\n\t" \ 45 "clts ;\n\t" \
43 "movups %%xmm0,(%1) ;\n\t" \ 46 "movups %%xmm0,(%1) ;\n\t" \
@@ -47,10 +50,11 @@ typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
47 : "=&r" (cr0) \ 50 : "=&r" (cr0) \
48 : "r" (xmm_save) \ 51 : "r" (xmm_save) \
49 : "memory"); \ 52 : "memory"); \
50} while(0) 53} while (0)
51 54
52#define XMMS_RESTORE do { \ 55#define XMMS_RESTORE \
53 asm volatile ( \ 56do { \
57 asm volatile( \
54 "sfence ;\n\t" \ 58 "sfence ;\n\t" \
55 "movups (%1),%%xmm0 ;\n\t" \ 59 "movups (%1),%%xmm0 ;\n\t" \
56 "movups 0x10(%1),%%xmm1 ;\n\t" \ 60 "movups 0x10(%1),%%xmm1 ;\n\t" \
@@ -61,72 +65,72 @@ typedef struct { unsigned long a,b; } __attribute__((aligned(16))) xmm_store_t;
61 : "r" (cr0), "r" (xmm_save) \ 65 : "r" (cr0), "r" (xmm_save) \
62 : "memory"); \ 66 : "memory"); \
63 preempt_enable(); \ 67 preempt_enable(); \
64} while(0) 68} while (0)
65 69
66#define OFFS(x) "16*("#x")" 70#define OFFS(x) "16*("#x")"
67#define PF_OFFS(x) "256+16*("#x")" 71#define PF_OFFS(x) "256+16*("#x")"
68#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n" 72#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
69#define LD(x,y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n" 73#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
70#define ST(x,y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n" 74#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
71#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n" 75#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
72#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n" 76#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
73#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n" 77#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
74#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n" 78#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
75#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n" 79#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
76#define XO1(x,y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n" 80#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
77#define XO2(x,y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n" 81#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
78#define XO3(x,y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n" 82#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
79#define XO4(x,y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n" 83#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
80#define XO5(x,y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n" 84#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
81 85
82 86
83static void 87static void
84xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2) 88xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
85{ 89{
86 unsigned int lines = bytes >> 8; 90 unsigned int lines = bytes >> 8;
87 unsigned long cr0; 91 unsigned long cr0;
88 xmm_store_t xmm_save[4]; 92 xmm_store_t xmm_save[4];
89 93
90 XMMS_SAVE; 94 XMMS_SAVE;
91 95
92 asm volatile ( 96 asm volatile(
93#undef BLOCK 97#undef BLOCK
94#define BLOCK(i) \ 98#define BLOCK(i) \
95 LD(i,0) \ 99 LD(i, 0) \
96 LD(i+1,1) \ 100 LD(i + 1, 1) \
97 PF1(i) \ 101 PF1(i) \
98 PF1(i+2) \ 102 PF1(i + 2) \
99 LD(i+2,2) \ 103 LD(i + 2, 2) \
100 LD(i+3,3) \ 104 LD(i + 3, 3) \
101 PF0(i+4) \ 105 PF0(i + 4) \
102 PF0(i+6) \ 106 PF0(i + 6) \
103 XO1(i,0) \ 107 XO1(i, 0) \
104 XO1(i+1,1) \ 108 XO1(i + 1, 1) \
105 XO1(i+2,2) \ 109 XO1(i + 2, 2) \
106 XO1(i+3,3) \ 110 XO1(i + 3, 3) \
107 ST(i,0) \ 111 ST(i, 0) \
108 ST(i+1,1) \ 112 ST(i + 1, 1) \
109 ST(i+2,2) \ 113 ST(i + 2, 2) \
110 ST(i+3,3) \ 114 ST(i + 3, 3) \
111 115
112 116
113 PF0(0) 117 PF0(0)
114 PF0(2) 118 PF0(2)
115 119
116 " .align 32 ;\n" 120 " .align 32 ;\n"
117 " 1: ;\n" 121 " 1: ;\n"
118 122
119 BLOCK(0) 123 BLOCK(0)
120 BLOCK(4) 124 BLOCK(4)
121 BLOCK(8) 125 BLOCK(8)
122 BLOCK(12) 126 BLOCK(12)
123 127
124 " addq %[inc], %[p1] ;\n" 128 " addq %[inc], %[p1] ;\n"
125 " addq %[inc], %[p2] ;\n" 129 " addq %[inc], %[p2] ;\n"
126 " decl %[cnt] ; jnz 1b" 130 " decl %[cnt] ; jnz 1b"
127 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines) 131 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
128 : [inc] "r" (256UL) 132 : [inc] "r" (256UL)
129 : "memory"); 133 : "memory");
130 134
131 XMMS_RESTORE; 135 XMMS_RESTORE;
132} 136}
@@ -141,52 +145,52 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
141 145
142 XMMS_SAVE; 146 XMMS_SAVE;
143 147
144 __asm__ __volatile__ ( 148 asm volatile(
145#undef BLOCK 149#undef BLOCK
146#define BLOCK(i) \ 150#define BLOCK(i) \
147 PF1(i) \ 151 PF1(i) \
148 PF1(i+2) \ 152 PF1(i + 2) \
149 LD(i,0) \ 153 LD(i, 0) \
150 LD(i+1,1) \ 154 LD(i + 1, 1) \
151 LD(i+2,2) \ 155 LD(i + 2, 2) \
152 LD(i+3,3) \ 156 LD(i + 3, 3) \
153 PF2(i) \ 157 PF2(i) \
154 PF2(i+2) \ 158 PF2(i + 2) \
155 PF0(i+4) \ 159 PF0(i + 4) \
156 PF0(i+6) \ 160 PF0(i + 6) \
157 XO1(i,0) \ 161 XO1(i, 0) \
158 XO1(i+1,1) \ 162 XO1(i + 1, 1) \
159 XO1(i+2,2) \ 163 XO1(i + 2, 2) \
160 XO1(i+3,3) \ 164 XO1(i + 3, 3) \
161 XO2(i,0) \ 165 XO2(i, 0) \
162 XO2(i+1,1) \ 166 XO2(i + 1, 1) \
163 XO2(i+2,2) \ 167 XO2(i + 2, 2) \
164 XO2(i+3,3) \ 168 XO2(i + 3, 3) \
165 ST(i,0) \ 169 ST(i, 0) \
166 ST(i+1,1) \ 170 ST(i + 1, 1) \
167 ST(i+2,2) \ 171 ST(i + 2, 2) \
168 ST(i+3,3) \ 172 ST(i + 3, 3) \
169 173
170 174
171 PF0(0) 175 PF0(0)
172 PF0(2) 176 PF0(2)
173 177
174 " .align 32 ;\n" 178 " .align 32 ;\n"
175 " 1: ;\n" 179 " 1: ;\n"
176 180
177 BLOCK(0) 181 BLOCK(0)
178 BLOCK(4) 182 BLOCK(4)
179 BLOCK(8) 183 BLOCK(8)
180 BLOCK(12) 184 BLOCK(12)
181 185
182 " addq %[inc], %[p1] ;\n" 186 " addq %[inc], %[p1] ;\n"
183 " addq %[inc], %[p2] ;\n" 187 " addq %[inc], %[p2] ;\n"
184 " addq %[inc], %[p3] ;\n" 188 " addq %[inc], %[p3] ;\n"
185 " decl %[cnt] ; jnz 1b" 189 " decl %[cnt] ; jnz 1b"
186 : [cnt] "+r" (lines), 190 : [cnt] "+r" (lines),
187 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3) 191 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
188 : [inc] "r" (256UL) 192 : [inc] "r" (256UL)
189 : "memory"); 193 : "memory");
190 XMMS_RESTORE; 194 XMMS_RESTORE;
191} 195}
192 196
@@ -195,64 +199,64 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
195 unsigned long *p3, unsigned long *p4) 199 unsigned long *p3, unsigned long *p4)
196{ 200{
197 unsigned int lines = bytes >> 8; 201 unsigned int lines = bytes >> 8;
198 xmm_store_t xmm_save[4]; 202 xmm_store_t xmm_save[4];
199 unsigned long cr0; 203 unsigned long cr0;
200 204
201 XMMS_SAVE; 205 XMMS_SAVE;
202 206
203 __asm__ __volatile__ ( 207 asm volatile(
204#undef BLOCK 208#undef BLOCK
205#define BLOCK(i) \ 209#define BLOCK(i) \
206 PF1(i) \ 210 PF1(i) \
207 PF1(i+2) \ 211 PF1(i + 2) \
208 LD(i,0) \ 212 LD(i, 0) \
209 LD(i+1,1) \ 213 LD(i + 1, 1) \
210 LD(i+2,2) \ 214 LD(i + 2, 2) \
211 LD(i+3,3) \ 215 LD(i + 3, 3) \
212 PF2(i) \ 216 PF2(i) \
213 PF2(i+2) \ 217 PF2(i + 2) \
214 XO1(i,0) \ 218 XO1(i, 0) \
215 XO1(i+1,1) \ 219 XO1(i + 1, 1) \
216 XO1(i+2,2) \ 220 XO1(i + 2, 2) \
217 XO1(i+3,3) \ 221 XO1(i + 3, 3) \
218 PF3(i) \ 222 PF3(i) \
219 PF3(i+2) \ 223 PF3(i + 2) \
220 PF0(i+4) \ 224 PF0(i + 4) \
221 PF0(i+6) \ 225 PF0(i + 6) \
222 XO2(i,0) \ 226 XO2(i, 0) \
223 XO2(i+1,1) \ 227 XO2(i + 1, 1) \
224 XO2(i+2,2) \ 228 XO2(i + 2, 2) \
225 XO2(i+3,3) \ 229 XO2(i + 3, 3) \
226 XO3(i,0) \ 230 XO3(i, 0) \
227 XO3(i+1,1) \ 231 XO3(i + 1, 1) \
228 XO3(i+2,2) \ 232 XO3(i + 2, 2) \
229 XO3(i+3,3) \ 233 XO3(i + 3, 3) \
230 ST(i,0) \ 234 ST(i, 0) \
231 ST(i+1,1) \ 235 ST(i + 1, 1) \
232 ST(i+2,2) \ 236 ST(i + 2, 2) \
233 ST(i+3,3) \ 237 ST(i + 3, 3) \
234 238
235 239
236 PF0(0) 240 PF0(0)
237 PF0(2) 241 PF0(2)
238 242
239 " .align 32 ;\n" 243 " .align 32 ;\n"
240 " 1: ;\n" 244 " 1: ;\n"
241 245
242 BLOCK(0) 246 BLOCK(0)
243 BLOCK(4) 247 BLOCK(4)
244 BLOCK(8) 248 BLOCK(8)
245 BLOCK(12) 249 BLOCK(12)
246 250
247 " addq %[inc], %[p1] ;\n" 251 " addq %[inc], %[p1] ;\n"
248 " addq %[inc], %[p2] ;\n" 252 " addq %[inc], %[p2] ;\n"
249 " addq %[inc], %[p3] ;\n" 253 " addq %[inc], %[p3] ;\n"
250 " addq %[inc], %[p4] ;\n" 254 " addq %[inc], %[p4] ;\n"
251 " decl %[cnt] ; jnz 1b" 255 " decl %[cnt] ; jnz 1b"
252 : [cnt] "+c" (lines), 256 : [cnt] "+c" (lines),
253 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4) 257 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
254 : [inc] "r" (256UL) 258 : [inc] "r" (256UL)
255 : "memory" ); 259 : "memory" );
256 260
257 XMMS_RESTORE; 261 XMMS_RESTORE;
258} 262}
@@ -261,70 +265,70 @@ static void
261xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2, 265xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
262 unsigned long *p3, unsigned long *p4, unsigned long *p5) 266 unsigned long *p3, unsigned long *p4, unsigned long *p5)
263{ 267{
264 unsigned int lines = bytes >> 8; 268 unsigned int lines = bytes >> 8;
265 xmm_store_t xmm_save[4]; 269 xmm_store_t xmm_save[4];
266 unsigned long cr0; 270 unsigned long cr0;
267 271
268 XMMS_SAVE; 272 XMMS_SAVE;
269 273
270 __asm__ __volatile__ ( 274 asm volatile(
271#undef BLOCK 275#undef BLOCK
272#define BLOCK(i) \ 276#define BLOCK(i) \
273 PF1(i) \ 277 PF1(i) \
274 PF1(i+2) \ 278 PF1(i + 2) \
275 LD(i,0) \ 279 LD(i, 0) \
276 LD(i+1,1) \ 280 LD(i + 1, 1) \
277 LD(i+2,2) \ 281 LD(i + 2, 2) \
278 LD(i+3,3) \ 282 LD(i + 3, 3) \
279 PF2(i) \ 283 PF2(i) \
280 PF2(i+2) \ 284 PF2(i + 2) \
281 XO1(i,0) \ 285 XO1(i, 0) \
282 XO1(i+1,1) \ 286 XO1(i + 1, 1) \
283 XO1(i+2,2) \ 287 XO1(i + 2, 2) \
284 XO1(i+3,3) \ 288 XO1(i + 3, 3) \
285 PF3(i) \ 289 PF3(i) \
286 PF3(i+2) \ 290 PF3(i + 2) \
287 XO2(i,0) \ 291 XO2(i, 0) \
288 XO2(i+1,1) \ 292 XO2(i + 1, 1) \
289 XO2(i+2,2) \ 293 XO2(i + 2, 2) \
290 XO2(i+3,3) \ 294 XO2(i + 3, 3) \
291 PF4(i) \ 295 PF4(i) \
292 PF4(i+2) \ 296 PF4(i + 2) \
293 PF0(i+4) \ 297 PF0(i + 4) \
294 PF0(i+6) \ 298 PF0(i + 6) \
295 XO3(i,0) \ 299 XO3(i, 0) \
296 XO3(i+1,1) \ 300 XO3(i + 1, 1) \
297 XO3(i+2,2) \ 301 XO3(i + 2, 2) \
298 XO3(i+3,3) \ 302 XO3(i + 3, 3) \
299 XO4(i,0) \ 303 XO4(i, 0) \
300 XO4(i+1,1) \ 304 XO4(i + 1, 1) \
301 XO4(i+2,2) \ 305 XO4(i + 2, 2) \
302 XO4(i+3,3) \ 306 XO4(i + 3, 3) \
303 ST(i,0) \ 307 ST(i, 0) \
304 ST(i+1,1) \ 308 ST(i + 1, 1) \
305 ST(i+2,2) \ 309 ST(i + 2, 2) \
306 ST(i+3,3) \ 310 ST(i + 3, 3) \
307 311
308 312
309 PF0(0) 313 PF0(0)
310 PF0(2) 314 PF0(2)
311 315
312 " .align 32 ;\n" 316 " .align 32 ;\n"
313 " 1: ;\n" 317 " 1: ;\n"
314 318
315 BLOCK(0) 319 BLOCK(0)
316 BLOCK(4) 320 BLOCK(4)
317 BLOCK(8) 321 BLOCK(8)
318 BLOCK(12) 322 BLOCK(12)
319 323
320 " addq %[inc], %[p1] ;\n" 324 " addq %[inc], %[p1] ;\n"
321 " addq %[inc], %[p2] ;\n" 325 " addq %[inc], %[p2] ;\n"
322 " addq %[inc], %[p3] ;\n" 326 " addq %[inc], %[p3] ;\n"
323 " addq %[inc], %[p4] ;\n" 327 " addq %[inc], %[p4] ;\n"
324 " addq %[inc], %[p5] ;\n" 328 " addq %[inc], %[p5] ;\n"
325 " decl %[cnt] ; jnz 1b" 329 " decl %[cnt] ; jnz 1b"
326 : [cnt] "+c" (lines), 330 : [cnt] "+c" (lines),
327 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4), 331 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
328 [p5] "+r" (p5) 332 [p5] "+r" (p5)
329 : [inc] "r" (256UL) 333 : [inc] "r" (256UL)
330 : "memory"); 334 : "memory");
@@ -333,18 +337,18 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
333} 337}
334 338
335static struct xor_block_template xor_block_sse = { 339static struct xor_block_template xor_block_sse = {
336 .name = "generic_sse", 340 .name = "generic_sse",
337 .do_2 = xor_sse_2, 341 .do_2 = xor_sse_2,
338 .do_3 = xor_sse_3, 342 .do_3 = xor_sse_3,
339 .do_4 = xor_sse_4, 343 .do_4 = xor_sse_4,
340 .do_5 = xor_sse_5, 344 .do_5 = xor_sse_5,
341}; 345};
342 346
343#undef XOR_TRY_TEMPLATES 347#undef XOR_TRY_TEMPLATES
344#define XOR_TRY_TEMPLATES \ 348#define XOR_TRY_TEMPLATES \
345 do { \ 349do { \
346 xor_speed(&xor_block_sse); \ 350 xor_speed(&xor_block_sse); \
347 } while (0) 351} while (0)
348 352
349/* We force the use of the SSE xor block because it can write around L2. 353/* We force the use of the SSE xor block because it can write around L2.
350 We may also be able to load into the L1 only depending on how the cpu 354 We may also be able to load into the L1 only depending on how the cpu