path: root/arch/x86/include/asm/percpu.h
author    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/include/asm/percpu.h
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c

Diffstat (limited to 'arch/x86/include/asm/percpu.h')
-rw-r--r--  arch/x86/include/asm/percpu.h | 256
1 file changed, 254 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index cd28f9ad910d..a0a9779084d1 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -45,12 +45,28 @@
 #include <linux/stringify.h>
 
 #ifdef CONFIG_SMP
-#define __percpu_arg(x)		"%%"__stringify(__percpu_seg)":%P" #x
+#define __percpu_prefix		"%%"__stringify(__percpu_seg)":"
 #define __my_cpu_offset		percpu_read(this_cpu_off)
+
+/*
+ * Compared to the generic __my_cpu_offset version, the following
+ * saves one instruction and avoids clobbering a temp register.
+ */
+#define __this_cpu_ptr(ptr)				\
+({							\
+	unsigned long tcp_ptr__;			\
+	__verify_pcpu_ptr(ptr);				\
+	asm volatile("add " __percpu_arg(1) ", %0"	\
+		     : "=r" (tcp_ptr__)			\
+		     : "m" (this_cpu_off), "0" (ptr));	\
+	(typeof(*(ptr)) __kernel __force *)tcp_ptr__;	\
+})
 #else
-#define __percpu_arg(x)		"%P" #x
+#define __percpu_prefix		""
 #endif
 
+#define __percpu_arg(x)		__percpu_prefix "%P" #x
+
 /*
  * Initialized pointers to per-cpu variables needed for the boot
  * processor need to use these macros to get the proper address
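To unpack the hunk above: on SMP builds __percpu_arg() now expands to a segment-prefixed memory operand via __percpu_prefix, and the new __this_cpu_ptr() turns a per-cpu address into an ordinary pointer with a single add of this_cpu_off. A rough usage sketch in kernel-style C follows; my_pcpu_stats and my_stats_hit() are hypothetical names, not part of the patch, and the caller must keep preemption disabled exactly as with any __this_cpu_* operation.

#include <linux/percpu.h>
#include <linux/preempt.h>

struct my_stats {
	unsigned long hits;
};

static DEFINE_PER_CPU(struct my_stats, my_pcpu_stats);

static void my_stats_hit(void)
{
	struct my_stats *s;

	preempt_disable();			/* stay on this CPU           */
	s = __this_cpu_ptr(&my_pcpu_stats);	/* one add, no temp clobbered */
	s->hits++;
	preempt_enable();
}
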
@@ -216,6 +232,125 @@ do { \
 })
 
 /*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val)				\
+({								\
+	typeof(var) paro_ret__ = val;				\
+	switch (sizeof(var)) {					\
+	case 1:							\
+		asm("xaddb %0, "__percpu_arg(1)			\
+			    : "+q" (paro_ret__), "+m" (var)	\
+			    : : "memory");			\
+		break;						\
+	case 2:							\
+		asm("xaddw %0, "__percpu_arg(1)			\
+			    : "+r" (paro_ret__), "+m" (var)	\
+			    : : "memory");			\
+		break;						\
+	case 4:							\
+		asm("xaddl %0, "__percpu_arg(1)			\
+			    : "+r" (paro_ret__), "+m" (var)	\
+			    : : "memory");			\
+		break;						\
+	case 8:							\
+		asm("xaddq %0, "__percpu_arg(1)			\
+			    : "+re" (paro_ret__), "+m" (var)	\
+			    : : "memory");			\
+		break;						\
+	default: __bad_percpu_size();				\
+	}							\
+	paro_ret__ += val;					\
+	paro_ret__;						\
+})
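Note the fix-up at the end of percpu_add_return_op(): xadd leaves the previous value of the variable in the register operand, so the macro adds val once more to hand back the new value. The same pattern in hedged, user-space form (plain memory instead of a segment-prefixed per-cpu slot; builds with GCC on x86-64, names are illustrative only):

#include <stdio.h>

/* Illustrative only: mirrors percpu_add_return_op for a plain long. */
static long add_return(long *p, long val)
{
	long ret = val;

	/* xadd writes the old *p into "ret" and stores old + val back. */
	asm("xaddq %0, %1"
	    : "+r" (ret), "+m" (*p)
	    : : "memory");
	return ret + val;	/* old + val == the new value */
}

int main(void)
{
	long counter = 40;
	long ret = add_return(&counter, 2);

	printf("returned %ld, counter now %ld\n", ret, counter); /* 42 and 42 */
	return 0;
}
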
+
+/*
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix.  The processor cannot prefetch
+ * cachelines if xchg is used.
+ */
+#define percpu_xchg_op(var, nval)				\
+({								\
+	typeof(var) pxo_ret__;					\
+	typeof(var) pxo_new__ = (nval);				\
+	switch (sizeof(var)) {					\
+	case 1:							\
+		asm("\n\tmov "__percpu_arg(1)",%%al"		\
+		    "\n1:\tcmpxchgb %2, "__percpu_arg(1)	\
+		    "\n\tjnz 1b"				\
+			    : "=&a" (pxo_ret__), "+m" (var)	\
+			    : "q" (pxo_new__)			\
+			    : "memory");			\
+		break;						\
+	case 2:							\
+		asm("\n\tmov "__percpu_arg(1)",%%ax"		\
+		    "\n1:\tcmpxchgw %2, "__percpu_arg(1)	\
+		    "\n\tjnz 1b"				\
+			    : "=&a" (pxo_ret__), "+m" (var)	\
+			    : "r" (pxo_new__)			\
+			    : "memory");			\
+		break;						\
+	case 4:							\
+		asm("\n\tmov "__percpu_arg(1)",%%eax"		\
+		    "\n1:\tcmpxchgl %2, "__percpu_arg(1)	\
+		    "\n\tjnz 1b"				\
+			    : "=&a" (pxo_ret__), "+m" (var)	\
+			    : "r" (pxo_new__)			\
+			    : "memory");			\
+		break;						\
+	case 8:							\
+		asm("\n\tmov "__percpu_arg(1)",%%rax"		\
+		    "\n1:\tcmpxchgq %2, "__percpu_arg(1)	\
+		    "\n\tjnz 1b"				\
+			    : "=&a" (pxo_ret__), "+m" (var)	\
+			    : "r" (pxo_new__)			\
+			    : "memory");			\
+		break;						\
+	default: __bad_percpu_size();				\
+	}							\
+	pxo_ret__;						\
+})
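Put differently, the macro loads the current value into the accumulator and retries an unlocked cmpxchg until it succeeds; because the variable is CPU-local, the loop is expected to succeed on the first pass, and no bus-locking xchg is ever issued. A hedged user-space sketch of the same retry structure (the GCC builtin below does emit a lock-prefixed cmpxchg, so it only mirrors the loop shape, not the unlocked per-cpu fast path; names are illustrative):

#include <stdio.h>

/* Illustrative only: exchange via a compare-and-swap retry loop. */
static int xchg_via_cmpxchg(int *p, int newval)
{
	int old;

	do {
		old = *p;		/* snapshot the current value    */
	} while (!__sync_bool_compare_and_swap(p, old, newval));

	return old;			/* what xchg would have returned */
}

int main(void)
{
	int v = 1;
	int prev = xchg_via_cmpxchg(&v, 5);

	printf("prev=%d now=%d\n", prev, v);	/* prev=1 now=5 */
	return 0;
}
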
+
+/*
+ * cmpxchg has no such implied lock semantics as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval)			\
+({								\
+	typeof(var) pco_ret__;					\
+	typeof(var) pco_old__ = (oval);				\
+	typeof(var) pco_new__ = (nval);				\
+	switch (sizeof(var)) {					\
+	case 1:							\
+		asm("cmpxchgb %2, "__percpu_arg(1)		\
+			    : "=a" (pco_ret__), "+m" (var)	\
+			    : "q" (pco_new__), "0" (pco_old__)	\
+			    : "memory");			\
+		break;						\
+	case 2:							\
+		asm("cmpxchgw %2, "__percpu_arg(1)		\
+			    : "=a" (pco_ret__), "+m" (var)	\
+			    : "r" (pco_new__), "0" (pco_old__)	\
+			    : "memory");			\
+		break;						\
+	case 4:							\
+		asm("cmpxchgl %2, "__percpu_arg(1)		\
+			    : "=a" (pco_ret__), "+m" (var)	\
+			    : "r" (pco_new__), "0" (pco_old__)	\
+			    : "memory");			\
+		break;						\
+	case 8:							\
+		asm("cmpxchgq %2, "__percpu_arg(1)		\
+			    : "=a" (pco_ret__), "+m" (var)	\
+			    : "r" (pco_new__), "0" (pco_old__)	\
+			    : "memory");			\
+		break;						\
+	default: __bad_percpu_size();				\
+	}							\
+	pco_ret__;						\
+})
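The this_cpu_cmpxchg_*() wrappers defined further down route 1- to 8-byte accesses to this macro, and a typical consumer is an optimistic read-modify-write loop. A hedged kernel-style sketch, assuming the size-generic this_cpu_read()/this_cpu_cmpxchg() wrappers from linux/percpu.h; my_hwm is a hypothetical per-cpu high-water mark, not part of the patch:

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, my_hwm);

static void my_hwm_update(unsigned long seen)
{
	unsigned long old;

	do {
		old = this_cpu_read(my_hwm);
		if (seen <= old)
			return;			/* nothing new to record */
	} while (this_cpu_cmpxchg(my_hwm, old, seen) != old);
}
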
+
+/*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
  * percpu_read_stable() is more efficient and can be used if its value
@@ -253,6 +388,12 @@ do { \
 #define __this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+/*
+ * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
+ * faster than an xchg with forced lock semantics.
+ */
+#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #define this_cpu_read_1(pcp)	percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_read_2(pcp)	percpu_from_op("mov", (pcp), "m"(pcp))
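For contrast, the "generic fallback" mentioned in the comment lives in include/linux/percpu.h and performs no atomic instruction at all: it reads the old value and writes the new one, with preemption (or IRQ) protection added for the this_cpu_/irqsafe_ variants. Roughly, as a simplified sketch rather than the literal kernel macro:

/* Simplified sketch of the generic (non-x86) xchg fallback shape. */
#define this_cpu_generic_xchg_sketch(pcp, nval)		\
({							\
	typeof(pcp) old__;				\
	preempt_disable();				\
	old__ = __this_cpu_read(pcp);			\
	__this_cpu_write(pcp, nval);			\
	preempt_enable();				\
	old__;						\
})
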
@@ -272,6 +413,9 @@ do { \
 #define this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
 
 #define irqsafe_cpu_add_1(pcp, val)	percpu_add_op((pcp), val)
 #define irqsafe_cpu_add_2(pcp, val)	percpu_add_op((pcp), val)
@@ -285,6 +429,49 @@ do { \
 #define irqsafe_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+
+#ifndef CONFIG_M386
+#define __this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_4(pcp, val)	percpu_add_return_op(pcp, val)
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
+#define this_cpu_add_return_1(pcp, val)		percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_2(pcp, val)		percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_4(pcp, val)		percpu_add_return_op(pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
+
+#ifdef CONFIG_X86_CMPXCHG64
+#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy = n2;					\
+	asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t"	\
+		    : "=a"(__ret), "=m" (pcp1), "=d"(__dummy)		\
+		    : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2));	\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)		percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
+#endif /* CONFIG_X86_CMPXCHG64 */
 
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -297,6 +484,7 @@ do { \
 #define __this_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
 #define __this_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
 #define __this_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define __this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 
 #define this_cpu_read_8(pcp)	percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_write_8(pcp, val)	percpu_to_op("mov", (pcp), val)
@@ -304,11 +492,48 @@ do { \
 #define this_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
 #define this_cpu_or_8(pcp, val)		percpu_to_op("or", (pcp), val)
 #define this_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
+#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #define irqsafe_cpu_add_8(pcp, val)	percpu_add_op((pcp), val)
 #define irqsafe_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
 #define irqsafe_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
 #define irqsafe_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
+/*
+ * Pretty complex macro to generate cmpxchg16 instruction.  The instruction
+ * is not supported on early AMD64 processors so we must be able to emulate
+ * it in software.  The address used in the cmpxchg16 instruction must be
+ * aligned to a 16 byte boundary.
+ */
+#ifdef CONFIG_SMP
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
+#else
+#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
+#endif
+#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)			\
+({									\
+	char __ret;							\
+	typeof(o1) __o1 = o1;						\
+	typeof(o1) __n1 = n1;						\
+	typeof(o2) __o2 = o2;						\
+	typeof(o2) __n2 = n2;						\
+	typeof(o2) __dummy;						\
+	alternative_io(CMPXCHG16B_EMU_CALL,				\
+		       "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t",	\
+		       X86_FEATURE_CX16,				\
+		       ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)),		\
+		       "S" (&pcp1), "b"(__n1), "c"(__n2),		\
+		       "a"(__o1), "d"(__o2) : "memory");		\
+	__ret;								\
+})
+
+#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
+#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2)	percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
 
 #endif
 
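When X86_FEATURE_CX16 is present the alternative patches in a real cmpxchg16b; otherwise the call to this_cpu_cmpxchg16b_emu remains. Either way the operand must be 16-byte aligned. A hedged user-space illustration of a 16-byte compare-and-swap (built with gcc -mcx16 on x86-64, where typical GCC versions emit cmpxchg16b for the builtin below; names are illustrative):

#include <stdio.h>

/* Illustrative only: cmpxchg16b, like the kernel path, needs a
 * 16-byte-aligned operand. */
static __int128 slot __attribute__((aligned(16)));

int main(void)
{
	__int128 expected = 0;
	__int128 desired  = ((__int128)2 << 64) | 1;	/* two 64-bit halves */

	if (__sync_bool_compare_and_swap(&slot, expected, desired))
		printf("swapped both halves atomically\n");

	printf("low=%llu high=%llu\n",
	       (unsigned long long)slot,
	       (unsigned long long)(slot >> 64));	/* low=1 high=2 */
	return 0;
}
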
@@ -322,6 +547,33 @@ do { \
 	old__;						\
 })
 
+static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
+                        const unsigned long __percpu *addr)
+{
+	unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
+
+	return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
+}
+
+static inline int x86_this_cpu_variable_test_bit(int nr,
+                        const unsigned long __percpu *addr)
+{
+	int oldbit;
+
+	asm volatile("bt "__percpu_arg(2)",%1\n\t"
+			"sbb %0,%0"
+			: "=r" (oldbit)
+			: "m" (*(unsigned long *)addr), "Ir" (nr));
+
+	return oldbit;
+}
+
+#define x86_this_cpu_test_bit(nr, addr)			\
+	(__builtin_constant_p((nr))			\
+	 ? x86_this_cpu_constant_test_bit((nr), (addr))	\
+	 : x86_this_cpu_variable_test_bit((nr), (addr)))
+
+
 #include <asm-generic/percpu.h>
 
 /* We can use this directly for local CPU (faster). */
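The variable-bit helper above uses a classic x86 idiom: bt copies the selected bit into the carry flag, and sbb of a register with itself then yields 0 or -1 depending on CF. A hedged user-space rendering of the same idiom on an ordinary word (no per-cpu segment prefix; names are illustrative):

#include <stdio.h>

/* Illustrative only: bt + sbb turns "is bit nr set?" into 0 or -1. */
static int variable_test_bit(int nr, const unsigned long *addr)
{
	int oldbit;

	asm volatile("bt %2, %1\n\t"
		     "sbb %0, %0"	/* 0 - 0 - CF gives 0 or -1 */
		     : "=r" (oldbit)
		     : "m" (*addr), "r" (nr));
	return oldbit;
}

int main(void)
{
	unsigned long word = 0x5;	/* bits 0 and 2 set */

	printf("bit2=%d bit1=%d\n",
	       !!variable_test_bit(2, &word),
	       !!variable_test_bit(1, &word));	/* bit2=1 bit1=0 */
	return 0;
}
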