author		Jan Beulich <JBeulich@suse.com>		2012-11-02 10:20:24 -0400
committer	Ingo Molnar <mingo@kernel.org>		2013-01-25 03:23:50 -0500
commit		f317820cb6ee3fb173319bf76e0e62437be78ad2 (patch)
tree		fc57358da4ba9f11a8d80e508d01e99c2c62c1f9 /arch
parent		e8f6e3f8a14bae98197c6d9f280cd23d22eb1a33 (diff)
x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line
On CPUs with 64-byte last level cache lines, this yields roughly 10%
better performance, independent of CPU vendor or specific model (as far
as I was able to test).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
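For illustration, here is a minimal stand-alone C sketch of the idea behind "prefetching once per 64-byte line": issue one software prefetch per cache line of the source buffer instead of one per load. This is not the kernel code below (which operates on up to five operands with SSE inline assembly through the new BLK64/BLOCK macros); the function name, the plain-C XOR loop and the four-line look-ahead distance are assumptions made purely for the example.

	#include <stddef.h>

	/*
	 * Hypothetical illustration only: XOR the 'bytes'-byte buffer p2 into
	 * p1, issuing a single software prefetch per 64-byte cache line of p2
	 * rather than one per load.  The look-ahead of 4 lines (256 bytes)
	 * mirrors the 256-byte loop stride of the SSE routines, but is an
	 * assumption here.
	 */
	static void xor_pf64_sketch(unsigned long *p1, const unsigned long *p2,
				    size_t bytes)
	{
		const size_t line = 64 / sizeof(unsigned long);	/* words per cache line */
		size_t words = bytes / sizeof(unsigned long);
		size_t i, j;

		for (i = 0; i < words; i += line) {
			/* one prefetch per 64-byte line, four lines ahead */
			__builtin_prefetch(&p2[i + 4 * line]);
			for (j = 0; j < line && i + j < words; j++)
				p1[i + j] ^= p2[i + j];
		}
	}

The kernel does not hard-code this choice: the new prefetch64-sse template is simply added to the set that xor_speed() benchmarks, as the changes to xor_32.h and xor_64.h below show.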
Diffstat (limited to 'arch')
-rw-r--r--	arch/x86/include/asm/xor.h	172
-rw-r--r--	arch/x86/include/asm/xor_32.h	23
-rw-r--r--	arch/x86/include/asm/xor_64.h	10
3 files changed, 187 insertions, 18 deletions
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index c661571ca0b7..d8829751b3f8 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -58,6 +58,14 @@
 #define XO2(x, y)	"	xorps	"OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
 #define XO3(x, y)	"	xorps	"OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
 #define XO4(x, y)	"	xorps	"OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i)	\
+	pf(i)			\
+	op(i, 0)		\
+	op(i + 1, 1)		\
+	op(i + 2, 2)		\
+	op(i + 3, 3)
 
 static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
@@ -111,6 +119,40 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 }
 
 static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
@@ -170,6 +212,43 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	add %[inc], %[p3]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
@@ -236,6 +315,45 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	add %[inc], %[p3]	;\n"
+	"	add %[inc], %[p4]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
@@ -308,12 +426,63 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	kernel_fpu_end();
 }
 
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(PF4, XO4, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	add %[inc], %[p3]	;\n"
+	"	add %[inc], %[p4]	;\n"
+	"	add %[inc], %[p5]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+	.name = "prefetch64-sse",
+	.do_2 = xor_sse_2_pf64,
+	.do_3 = xor_sse_3_pf64,
+	.do_4 = xor_sse_4_pf64,
+	.do_5 = xor_sse_5_pf64,
+};
+
 #undef LD
 #undef XO1
 #undef XO2
 #undef XO3
 #undef XO4
 #undef ST
+#undef NOP
+#undef BLK64
 #undef BLOCK
 
 #undef XOR_CONSTANT_CONSTRAINT
@@ -324,4 +493,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 # include <asm/xor_64.h>
 #endif
 
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(FASTEST)
+
 #endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87f3cc7..ce05722e3c68 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* Also try the generic routines. */
 #include <asm-generic/xor.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES				\
 do {							\
-	xor_speed(&xor_block_8regs);			\
-	xor_speed(&xor_block_8regs_p);			\
-	xor_speed(&xor_block_32regs);			\
-	xor_speed(&xor_block_32regs_p);			\
 	AVX_XOR_SPEED;					\
-	if (cpu_has_xmm)				\
+	if (cpu_has_xmm) {				\
 		xor_speed(&xor_block_pIII_sse);		\
-	if (cpu_has_mmx) {				\
+		xor_speed(&xor_block_sse_pf64);		\
+	} else if (cpu_has_mmx) {			\
 		xor_speed(&xor_block_pII_mmx);		\
 		xor_speed(&xor_block_p5_mmx);		\
+	} else {					\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
 	}						\
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1baf89dcc423..546f1e3b87cc 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
 /* Also try the AVX routines */
 #include <asm/xor_avx.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched. */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES			\
 do {						\
 	AVX_XOR_SPEED;				\
+	xor_speed(&xor_block_sse_pf64);		\
 	xor_speed(&xor_block_sse);		\
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(&xor_block_sse)
-
 #endif /* _ASM_X86_XOR_64_H */