diff options
author | Christoph Lameter <cl@linux.com> | 2010-12-14 11:28:47 -0500 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2010-12-18 09:54:04 -0500 |
commit | 8270137a0d50507a5b40f880db636527045b8466 (patch) | |
tree | 3490a31fcbea09ab5fffb6b2f4330dc92896f413 /arch | |
parent | 7296e08abac0a22a2534a4f6e493c764f2c77583 (diff) |
cpuops: Use cmpxchg for xchg to avoid lock semantics
Use cmpxchg instead of xchg to implement this_cpu_xchg.
xchg with a memory operand always implies LOCK and therefore incurs LOCK
overhead, whereas cmpxchg without an explicit LOCK prefix does not.
Baselines:
xchg() = 18 cycles (no segment prefix, LOCK semantics)
__this_cpu_xchg = 1 cycle
(simulated using this_cpu_read/write, two prefixes. Looks like the
cpu can use loop optimization to get rid of most of the overhead)
Cycles before:
this_cpu_xchg = 37 cycles (segment prefix and LOCK (implied by xchg))
After:
this_cpu_xchg = 11 cycles (using cmpxchg without lock semantics)
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/include/asm/percpu.h | 21 |
1 files changed, 15 insertions, 6 deletions
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index b85ade511a53..8ee45167e817 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h | |||
@@ -263,8 +263,9 @@ do { \ | |||
263 | }) | 263 | }) |
264 | 264 | ||
265 | /* | 265 | /* |
266 | * Beware: xchg on x86 has an implied lock prefix. There will be the cost of | 266 | * xchg is implemented using cmpxchg without a lock prefix. xchg is |
267 | * full lock semantics even though they are not needed. | 267 | * expensive due to the implied lock prefix. The processor cannot prefetch |
268 | * cachelines if xchg is used. | ||
268 | */ | 269 | */ |
269 | #define percpu_xchg_op(var, nval) \ | 270 | #define percpu_xchg_op(var, nval) \ |
270 | ({ \ | 271 | ({ \ |
@@ -272,25 +273,33 @@ do { \ | |||
272 | typeof(var) pxo_new__ = (nval); \ | 273 | typeof(var) pxo_new__ = (nval); \ |
273 | switch (sizeof(var)) { \ | 274 | switch (sizeof(var)) { \ |
274 | case 1: \ | 275 | case 1: \ |
275 | asm("xchgb %2, "__percpu_arg(1) \ | 276 | asm("\n1:mov "__percpu_arg(1)",%%al" \ |
277 | "\n\tcmpxchgb %2, "__percpu_arg(1) \ | ||
278 | "\n\tjnz 1b" \ | ||
276 | : "=a" (pxo_ret__), "+m" (var) \ | 279 | : "=a" (pxo_ret__), "+m" (var) \ |
277 | : "q" (pxo_new__) \ | 280 | : "q" (pxo_new__) \ |
278 | : "memory"); \ | 281 | : "memory"); \ |
279 | break; \ | 282 | break; \ |
280 | case 2: \ | 283 | case 2: \ |
281 | asm("xchgw %2, "__percpu_arg(1) \ | 284 | asm("\n1:mov "__percpu_arg(1)",%%ax" \ |
285 | "\n\tcmpxchgw %2, "__percpu_arg(1) \ | ||
286 | "\n\tjnz 1b" \ | ||
282 | : "=a" (pxo_ret__), "+m" (var) \ | 287 | : "=a" (pxo_ret__), "+m" (var) \ |
283 | : "r" (pxo_new__) \ | 288 | : "r" (pxo_new__) \ |
284 | : "memory"); \ | 289 | : "memory"); \ |
285 | break; \ | 290 | break; \ |
286 | case 4: \ | 291 | case 4: \ |
287 | asm("xchgl %2, "__percpu_arg(1) \ | 292 | asm("\n1:mov "__percpu_arg(1)",%%eax" \ |
293 | "\n\tcmpxchgl %2, "__percpu_arg(1) \ | ||
294 | "\n\tjnz 1b" \ | ||
288 | : "=a" (pxo_ret__), "+m" (var) \ | 295 | : "=a" (pxo_ret__), "+m" (var) \ |
289 | : "r" (pxo_new__) \ | 296 | : "r" (pxo_new__) \ |
290 | : "memory"); \ | 297 | : "memory"); \ |
291 | break; \ | 298 | break; \ |
292 | case 8: \ | 299 | case 8: \ |
293 | asm("xchgq %2, "__percpu_arg(1) \ | 300 | asm("\n1:mov "__percpu_arg(1)",%%rax" \ |
301 | "\n\tcmpxchgq %2, "__percpu_arg(1) \ | ||
302 | "\n\tjnz 1b" \ | ||
294 | : "=a" (pxo_ret__), "+m" (var) \ | 303 | : "=a" (pxo_ret__), "+m" (var) \ |
295 | : "r" (pxo_new__) \ | 304 | : "r" (pxo_new__) \ |
296 | : "memory"); \ | 305 | : "memory"); \ |