about | summary | refs | log | tree | commit | diff | stats
path: root/arch
diff options
context:
space:
mode:
author: Christoph Lameter <cl@linux.com> 2010-12-14 11:28:47 -0500
committer: Tejun Heo <tj@kernel.org> 2010-12-18 09:54:04 -0500
commit: 8270137a0d50507a5b40f880db636527045b8466 (patch)
tree: 3490a31fcbea09ab5fffb6b2f4330dc92896f413 /arch
parent: 7296e08abac0a22a2534a4f6e493c764f2c77583 (diff)
cpuops: Use cmpxchg for xchg to avoid lock semantics
Use cmpxchg instead of xchg to realize this_cpu_xchg.

xchg will cause LOCK overhead since LOCK is always implied but cmpxchg will not.

Baselines:

xchg() = 18 cycles (no segment prefix, LOCK semantics)
__this_cpu_xchg = 1 cycle (simulated using this_cpu_read/write, two prefixes. Looks like the cpu can use loop optimization to get rid of most of the overhead)

Cycles before:

this_cpu_xchg = 37 cycles (segment prefix and LOCK (implied by xchg))

After:

this_cpu_xchg = 11 cycles (using cmpxchg without lock semantics)

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r-- arch/x86/include/asm/percpu.h 21
1 files changed, 15 insertions, 6 deletions
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index b85ade511a53..8ee45167e817 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -263,8 +263,9 @@ do { \
263}) 263})
264 264
265/* 265/*
266 * Beware: xchg on x86 has an implied lock prefix. There will be the cost of 266 * xchg is implemented using cmpxchg without a lock prefix. xchg is
267 * full lock semantics even though they are not needed. 267 * expensive due to the implied lock prefix. The processor cannot prefetch
268 * cachelines if xchg is used.
268 */ 269 */
269#define percpu_xchg_op(var, nval) \ 270#define percpu_xchg_op(var, nval) \
270({ \ 271({ \
@@ -272,25 +273,33 @@ do { \
272 typeof(var) pxo_new__ = (nval); \ 273 typeof(var) pxo_new__ = (nval); \
273 switch (sizeof(var)) { \ 274 switch (sizeof(var)) { \
274 case 1: \ 275 case 1: \
275 asm("xchgb %2, "__percpu_arg(1) \ 276 asm("\n1:mov "__percpu_arg(1)",%%al" \
277 "\n\tcmpxchgb %2, "__percpu_arg(1) \
278 "\n\tjnz 1b" \
276 : "=a" (pxo_ret__), "+m" (var) \ 279 : "=a" (pxo_ret__), "+m" (var) \
277 : "q" (pxo_new__) \ 280 : "q" (pxo_new__) \
278 : "memory"); \ 281 : "memory"); \
279 break; \ 282 break; \
280 case 2: \ 283 case 2: \
281 asm("xchgw %2, "__percpu_arg(1) \ 284 asm("\n1:mov "__percpu_arg(1)",%%ax" \
285 "\n\tcmpxchgw %2, "__percpu_arg(1) \
286 "\n\tjnz 1b" \
282 : "=a" (pxo_ret__), "+m" (var) \ 287 : "=a" (pxo_ret__), "+m" (var) \
283 : "r" (pxo_new__) \ 288 : "r" (pxo_new__) \
284 : "memory"); \ 289 : "memory"); \
285 break; \ 290 break; \
286 case 4: \ 291 case 4: \
287 asm("xchgl %2, "__percpu_arg(1) \ 292 asm("\n1:mov "__percpu_arg(1)",%%eax" \
293 "\n\tcmpxchgl %2, "__percpu_arg(1) \
294 "\n\tjnz 1b" \
288 : "=a" (pxo_ret__), "+m" (var) \ 295 : "=a" (pxo_ret__), "+m" (var) \
289 : "r" (pxo_new__) \ 296 : "r" (pxo_new__) \
290 : "memory"); \ 297 : "memory"); \
291 break; \ 298 break; \
292 case 8: \ 299 case 8: \
293 asm("xchgq %2, "__percpu_arg(1) \ 300 asm("\n1:mov "__percpu_arg(1)",%%rax" \
301 "\n\tcmpxchgq %2, "__percpu_arg(1) \
302 "\n\tjnz 1b" \
294 : "=a" (pxo_ret__), "+m" (var) \ 303 : "=a" (pxo_ret__), "+m" (var) \
295 : "r" (pxo_new__) \ 304 : "r" (pxo_new__) \
296 : "memory"); \ 305 : "memory"); \