-rw-r--r--	arch/x86/include/asm/xor.h	172
-rw-r--r--	arch/x86/include/asm/xor_32.h	23
-rw-r--r--	arch/x86/include/asm/xor_64.h	10
3 files changed, 187 insertions, 18 deletions
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index c661571ca0b7..d8829751b3f8 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -58,6 +58,14 @@
 #define XO2(x, y)	"	xorps "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
 #define XO3(x, y)	"	xorps "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
 #define XO4(x, y)	"	xorps "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i)		\
+		pf(i)			\
+		op(i, 0)		\
+		op(i + 1, 1)		\
+		op(i + 2, 2)		\
+		op(i + 3, 3)
 
 static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
@@ -111,6 +119,40 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 }
 
 static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3)
 {
@@ -170,6 +212,43 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	add %[inc], %[p3]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines),
+	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4)
 {
@@ -236,6 +315,45 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	add %[inc], %[p3]	;\n"
+	"	add %[inc], %[p4]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1),
+	  [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
@@ -308,12 +426,63 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 	kernel_fpu_end();
 }
 
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+	       unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+	unsigned long lines = bytes >> 8;
+
+	kernel_fpu_begin();
+
+	asm volatile(
+#undef BLOCK
+#define BLOCK(i)			\
+		BLK64(PF0, LD, i)	\
+		BLK64(PF1, XO1, i)	\
+		BLK64(PF2, XO2, i)	\
+		BLK64(PF3, XO3, i)	\
+		BLK64(PF4, XO4, i)	\
+		BLK64(NOP, ST, i)	\
+
+	" .align 32			;\n"
+	" 1:				;\n"
+
+		BLOCK(0)
+		BLOCK(4)
+		BLOCK(8)
+		BLOCK(12)
+
+	"	add %[inc], %[p1]	;\n"
+	"	add %[inc], %[p2]	;\n"
+	"	add %[inc], %[p3]	;\n"
+	"	add %[inc], %[p4]	;\n"
+	"	add %[inc], %[p5]	;\n"
+	"	dec %[cnt]		;\n"
+	"	jnz 1b			;\n"
+	: [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+	  [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+	: [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+	: "memory");
+
+	kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+	.name = "prefetch64-sse",
+	.do_2 = xor_sse_2_pf64,
+	.do_3 = xor_sse_3_pf64,
+	.do_4 = xor_sse_4_pf64,
+	.do_5 = xor_sse_5_pf64,
+};
+
 #undef LD
 #undef XO1
 #undef XO2
 #undef XO3
 #undef XO4
 #undef ST
+#undef NOP
+#undef BLK64
 #undef BLOCK
 
 #undef XOR_CONSTANT_CONSTRAINT
@@ -324,4 +493,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 # include <asm/xor_64.h>
 #endif
 
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+	AVX_SELECT(FASTEST)
+
 #endif /* _ASM_X86_XOR_H */
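
As a rough guide to the xor.h hunks above: the new *_pf64() routines reuse the existing LD/ST/XO*/PF* helpers but group them through BLK64(), four 16-byte xmm operations per 64-byte cache line with the prefetch macro issued once per line (NOP for the store pass), hence the "prefetch64-sse" name and the 256-byte stride implied by lines = bytes >> 8. A plain-C sketch of what one iteration of xor_sse_2_pf64() computes, with a hypothetical function name and no SSE or prefetching, would be roughly:

/*
 * Illustrative only, not part of the patch: scalar equivalent of one
 * xor_sse_2_pf64() loop iteration.  Each iteration covers 256 bytes,
 * handled as four 64-byte cache lines (BLOCK(0)/BLOCK(4)/BLOCK(8)/
 * BLOCK(12)); the real code issues one prefetch per 64-byte line.
 */
#include <stddef.h>

static void xor_2_scalar(size_t bytes, unsigned long *p1, const unsigned long *p2)
{
	size_t lines = bytes >> 8;			/* 256-byte chunks, as in the patch */
	size_t words = 64 / sizeof(unsigned long);	/* words per 64-byte cache line */

	while (lines--) {
		for (size_t line = 0; line < 4; line++) {
			/* the PF0/PF1 prefetch hints would be issued here, once per line */
			for (size_t w = 0; w < words; w++)
				p1[line * words + w] ^= p2[line * words + w];
		}
		p1 += 4 * words;			/* advance both pointers by 256 bytes */
		p2 += 4 * words;
	}
}
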
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87f3cc7..ce05722e3c68 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched.  */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES				\
 do {							\
-	xor_speed(&xor_block_8regs);			\
-	xor_speed(&xor_block_8regs_p);			\
-	xor_speed(&xor_block_32regs);			\
-	xor_speed(&xor_block_32regs_p);			\
 	AVX_XOR_SPEED;					\
-	if (cpu_has_xmm)				\
+	if (cpu_has_xmm) {				\
 		xor_speed(&xor_block_pIII_sse);		\
-	if (cpu_has_mmx) {				\
+		xor_speed(&xor_block_sse_pf64);		\
+	} else if (cpu_has_mmx) {			\
 		xor_speed(&xor_block_pII_mmx);		\
 		xor_speed(&xor_block_p5_mmx);		\
+	} else {					\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
 	}						\
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1baf89dcc423..546f1e3b87cc 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
 /* Also try the AVX routines */
 #include <asm/xor_avx.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched.  */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES			\
 do {						\
 	AVX_XOR_SPEED;				\
+	xor_speed(&xor_block_sse_pf64);		\
 	xor_speed(&xor_block_sse);		\
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-	AVX_SELECT(&xor_block_sse)
-
 #endif /* _ASM_X86_XOR_64_H */
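
Taken together, the xor_32.h and xor_64.h hunks stop hard-wiring an SSE template through XOR_SELECT_TEMPLATE() and let the boot-time benchmark decide: XOR_TRY_TEMPLATES now also times the new prefetch64-sse template whenever SSE is available (the 32-bit version only falls back to the MMX and generic register routines when SSE is absent), and the shared XOR_SELECT_TEMPLATE(FASTEST) in xor.h simply passes the measured winner through AVX_SELECT(). A minimal standalone sketch of that benchmark-then-select idea, using made-up names and placeholder numbers rather than the kernel's real calibration code, would be:

/*
 * Illustrative sketch only: the struct, functions and values below are
 * invented for this note and are not the kernel's crypto/xor.c code.
 */
#include <stdio.h>

struct xor_tmpl {
	const char *name;
	unsigned long speed;	/* arbitrary placeholder score for the sketch */
};

static void try_template(struct xor_tmpl *t, struct xor_tmpl **fastest)
{
	/* The kernel times each template on scratch buffers; here the
	   "measurement" is just the preset field above. */
	if (!*fastest || t->speed > (*fastest)->speed)
		*fastest = t;
}

int main(void)
{
	struct xor_tmpl sse      = { "pIII_sse",       1 };
	struct xor_tmpl sse_pf64 = { "prefetch64-sse", 2 };
	struct xor_tmpl *fastest = NULL;

	/* XOR_TRY_TEMPLATES analogue: benchmark every candidate. */
	try_template(&sse, &fastest);
	try_template(&sse_pf64, &fastest);

	/* XOR_SELECT_TEMPLATE(FASTEST) analogue: after this patch the
	   measured winner is used instead of a hard-coded SSE template. */
	printf("using %s\n", fastest->name);
	return 0;
}
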
