author    Jan Beulich <JBeulich@suse.com>    2012-11-02 10:20:24 -0400
committer Ingo Molnar <mingo@kernel.org>     2013-01-25 03:23:50 -0500
commit    f317820cb6ee3fb173319bf76e0e62437be78ad2 (patch)
tree      fc57358da4ba9f11a8d80e508d01e99c2c62c1f9 /arch
parent    e8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (diff)
x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line
On CPUs with 64-byte last level cache lines, this yields roughly
10% better performance, independent of CPU vendor or specific
model (as far as I was able to test).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch')
 -rw-r--r--  arch/x86/include/asm/xor.h    | 172
 -rw-r--r--  arch/x86/include/asm/xor_32.h |  23
 -rw-r--r--  arch/x86/include/asm/xor_64.h |  10
3 files changed, 187 insertions, 18 deletions
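The core idea of the pf64 variants added below can be restated outside of SSE assembly: instead of issuing a prefetch for every 16-byte chunk touched, issue one prefetch per 64-byte cache line and then process the whole line. The following is a minimal plain-C sketch of that pattern only, not the kernel code; the function name, the use of GCC's __builtin_prefetch, and the 256-byte prefetch distance are assumptions made for illustration.

```c
#include <stddef.h>
#include <stdint.h>

/*
 * Illustrative sketch (not the kernel implementation): XOR p2 into p1,
 * issuing a single prefetch per 64-byte cache line rather than one per
 * 16-byte chunk.  Assumes bytes is a multiple of 64, as the kernel xor
 * templates do for their inputs.  Prefetching past the end of the buffer
 * is a harmless hint on x86.
 */
static void xor_2_pf64_sketch(size_t bytes, uint64_t *p1, const uint64_t *p2)
{
	size_t words = bytes / sizeof(*p1);

	for (size_t i = 0; i < words; i += 8) {		/* 8 * 8 bytes = one 64-byte line */
		__builtin_prefetch(&p1[i + 32], 1, 0);	/* one prefetch per line, 256 bytes ahead */
		__builtin_prefetch(&p2[i + 32], 0, 0);

		for (size_t j = 0; j < 8; j++)		/* XOR the whole line */
			p1[i + j] ^= p2[i + j];
	}
}
```

In the patch itself the same grouping is expressed by the new BLK64() macro: one PF0/PF1/... prefetch followed by four 16-byte movaps/xorps operations covering that line, executed inside kernel_fpu_begin()/kernel_fpu_end().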
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index c661571ca0b7..d8829751b3f8 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -58,6 +58,14 @@
 #define XO2(x, y)	" xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
 #define XO3(x, y)	" xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
 #define XO4(x, y)	" xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i)		\
+		pf(i)			\
+		op(i, 0)		\
+		op(i + 1, 1)		\
+		op(i + 2, 2)		\
+		op(i + 3, 3)
 
 static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
@@ -111,6 +119,40 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 }
 
 static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
@@ -170,6 +212,43 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
@@ -236,6 +315,45 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
@@ -308,12 +426,63 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	kernel_fpu_end();
 }
 
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(PF4, XO4, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32 ;\n"
+	" 1: ;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	" add %[inc], %[p1] ;\n"
+	" add %[inc], %[p2] ;\n"
+	" add %[inc], %[p3] ;\n"
+	" add %[inc], %[p4] ;\n"
+	" add %[inc], %[p5] ;\n"
+	" dec %[cnt] ;\n"
+	" jnz 1b ;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+	.name = "prefetch64-sse",
+	.do_2 = xor_sse_2_pf64,
+	.do_3 = xor_sse_3_pf64,
+	.do_4 = xor_sse_4_pf64,
+	.do_5 = xor_sse_5_pf64,
+};
+
 #undef LD
 #undef XO1
 #undef XO2
 #undef XO3
 #undef XO4
 #undef ST
+#undef NOP
+#undef BLK64
 #undef BLOCK
 
 #undef XOR_CONSTANT_CONSTRAINT
@@ -324,4 +493,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 # include <asm/xor_64.h>
 #endif
 
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(FASTEST)
+
 #endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87f3cc7..ce05722e3c68 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES				\
 do {							\
-	xor_speed(&xor_block_8regs);			\
-	xor_speed(&xor_block_8regs_p);			\
-	xor_speed(&xor_block_32regs);			\
-	xor_speed(&xor_block_32regs_p);			\
 	AVX_XOR_SPEED;					\
-	if (cpu_has_xmm)				\
+	if (cpu_has_xmm) {				\
 		xor_speed(&xor_block_pIII_sse);		\
-	if (cpu_has_mmx) {				\
+		xor_speed(&xor_block_sse_pf64);		\
+	} else if (cpu_has_mmx) {			\
 		xor_speed(&xor_block_pII_mmx);		\
 		xor_speed(&xor_block_p5_mmx);		\
+	} else {					\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
 	}						\
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1baf89dcc423..546f1e3b87cc 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
 /* Also try the AVX routines */
 #include <asm/xor_avx.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES			\
 do {						\
 	AVX_XOR_SPEED;				\
+	xor_speed(&xor_block_sse_pf64);		\
 	xor_speed(&xor_block_sse);		\
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(&xor_block_sse)
-
 #endif /* _ASM_X86_XOR_64_H */