cpuops: Use cmpxchg for xchg to avoid lock semantics

Use cmpxchg instead of xchg to realize this_cpu_xchg.

xchg will cause LOCK overhead since LOCK is always implied, but cmpxchg will not.

Baselines:

xchg()          = 18 cycles (no segment prefix, LOCK semantics)
__this_cpu_xchg = 1 cycle

(simulated using this_cpu_read/write, two prefixes. Looks like the cpu can use loop optimization to get rid of most of the overhead)

Cycles before:

this_cpu_xchg   = 37 cycles (segment prefix and LOCK (implied by xchg))

After:

this_cpu_xchg   = 11 cycles (using cmpxchg without lock semantics)

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
author: Christoph Lameter <cl@linux.com> 2010-12-14 11:28:47 -0500
committer: Tejun Heo <tj@kernel.org> 2010-12-18 09:54:04 -0500
commit: 8270137a0d50507a5b40f880db636527045b8466 (patch)
tree: 3490a31fcbea09ab5fffb6b2f4330dc92896f413 /arch
parent: 7296e08abac0a22a2534a4f6e493c764f2c77583 (diff)
1 files changed, 15 insertions, 6 deletions
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index b85ade511a53..8ee45167e817 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -263,8 +263,9 @@ do {									\
 })
 /*
- * Beware: xchg on x86 has an implied lock prefix. There will be the cost of
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * full lock semantics even though they are not needed.
+ * expensive due to the implied lock prefix.  The processor cannot prefetch
+ * cachelines if xchg is used.
 */
 #define percpu_xchg_op(var, nval)                                       \
 ({                                                                      \
@@ -272,25 +273,33 @@ do {									\
        typeof(var) pxo_new__ = (nval);                                 \
        switch (sizeof(var)) {                                          \
        case 1:                                                         \
-                asm("xchgb %2, "__percpu_arg(1)                         \
+                asm("\n1:mov "__percpu_arg(1)",%%al"                    \
+                    "\n\tcmpxchgb %2, "__percpu_arg(1)                  \
+                    "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "q" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 2:                                                         \
-                asm("xchgw %2, "__percpu_arg(1)                         \
+                asm("\n1:mov "__percpu_arg(1)",%%ax"                    \
+                    "\n\tcmpxchgw %2, "__percpu_arg(1)                  \
+                    "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 4:                                                         \
-                asm("xchgl %2, "__percpu_arg(1)                         \
+                asm("\n1:mov "__percpu_arg(1)",%%eax"                   \
+                    "\n\tcmpxchgl %2, "__percpu_arg(1)                  \
+                    "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
                break;                                                  \
        case 8:                                                         \
-                asm("xchgq %2, "__percpu_arg(1)                         \
+                asm("\n1:mov "__percpu_arg(1)",%%rax"                   \
+                    "\n\tcmpxchgq %2, "__percpu_arg(1)                  \
+                    "\n\tjnz 1b"                                        \
                            : "=a" (pxo_ret__), "+m" (var)              \
                            : "r" (pxo_new__)                           \
                            : "memory");                                \
author	Christoph Lameter <cl@linux.com>	2010-12-14 11:28:47 -0500
committer	Tejun Heo <tj@kernel.org>	2010-12-18 09:54:04 -0500
commit	8270137a0d50507a5b40f880db636527045b8466 (patch)
tree	3490a31fcbea09ab5fffb6b2f4330dc92896f413 /arch
parent	7296e08abac0a22a2534a4f6e493c764f2c77583 (diff)