author     Linus Torvalds <torvalds@linux-foundation.org>   2016-01-11 17:18:38 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-01-11 17:18:38 -0500
commit     24af98c4cf5f5e69266e270c7f3fb34b82ff6656 (patch)
tree       70d71381c841c92b2d28397bf0c5d6a7d9bbbaac
parent     9061cbe62adeccf8c986883bcd40f4aeee59ea75 (diff)
parent     337f13046ff03717a9e99675284a817527440a49 (diff)
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
 "So we have a laundry list of locking subsystem changes:

   - continuing barrier API and code improvements

   - futex enhancements

   - atomics API improvements

   - pvqspinlock enhancements: in particular lock stealing and adaptive
     spinning

   - qspinlock micro-enhancements"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
  futex: Cleanup the goto confusion in requeue_pi()
  futex: Remove pointless put_pi_state calls in requeue()
  futex: Document pi_state refcounting in requeue code
  futex: Rename free_pi_state() to put_pi_state()
  futex: Drop refcount if requeue_pi() acquired the rtmutex
  locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation
  lcoking/barriers, arch: Use smp barriers in smp_store_release()
  locking/cmpxchg, arch: Remove tas() definitions
  locking/pvqspinlock: Queue node adaptive spinning
  locking/pvqspinlock: Allow limited lock stealing
  locking/pvqspinlock: Collect slowpath lock statistics
  sched/core, locking: Document Program-Order guarantees
  locking, sched: Introduce smp_cond_acquire() and use it
  locking/pvqspinlock, x86: Optimize the PV unlock code path
  locking/qspinlock: Avoid redundant read of next pointer
  locking/qspinlock: Prefetch the next node cacheline
  locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg()
  atomics: Add test for atomic operations with _relaxed variants
-rw-r--r--   Documentation/memory-barriers.txt            4
-rw-r--r--   arch/blackfin/include/asm/cmpxchg.h          1
-rw-r--r--   arch/c6x/include/asm/cmpxchg.h               2
-rw-r--r--   arch/frv/include/asm/cmpxchg.h               2
-rw-r--r--   arch/ia64/include/asm/barrier.h              2
-rw-r--r--   arch/powerpc/include/asm/barrier.h           2
-rw-r--r--   arch/s390/include/asm/barrier.h              2
-rw-r--r--   arch/tile/include/asm/cmpxchg.h              2
-rw-r--r--   arch/x86/Kconfig                             8
-rw-r--r--   arch/x86/include/asm/qspinlock_paravirt.h   59
-rw-r--r--   include/asm-generic/barrier.h                2
-rw-r--r--   include/asm-generic/qspinlock.h              9
-rw-r--r--   include/linux/compiler.h                    17
-rw-r--r--   kernel/futex.c                              83
-rw-r--r--   kernel/locking/qspinlock.c                  82
-rw-r--r--   kernel/locking/qspinlock_paravirt.h        252
-rw-r--r--   kernel/locking/qspinlock_stat.h            300
-rw-r--r--   kernel/sched/core.c                         99
-rw-r--r--   kernel/sched/sched.h                         2
-rw-r--r--   lib/atomic64_test.c                        120
20 files changed, 904 insertions, 146 deletions
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 85304ebd187c..a61be39c7b51 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1673,8 +1673,8 @@ There are some more advanced barrier functions:
1673 (*) smp_store_mb(var, value) 1673 (*) smp_store_mb(var, value)
1674 1674
1675 This assigns the value to the variable and then inserts a full memory 1675 This assigns the value to the variable and then inserts a full memory
1676 barrier after it, depending on the function. It isn't guaranteed to 1676 barrier after it. It isn't guaranteed to insert anything more than a
1677 insert anything more than a compiler barrier in a UP compilation. 1677 compiler barrier in a UP compilation.
1678 1678
1679 1679
1680 (*) smp_mb__before_atomic(); 1680 (*) smp_mb__before_atomic();
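
For reference, the generic definition that this text documents reads as follows after this series (a minimal sketch mirroring the include/asm-generic/barrier.h hunk further down, shown here only for illustration):

    #define smp_store_mb(var, value) \
        do { WRITE_ONCE(var, value); smp_mb(); } while (0)

On SMP builds smp_mb() expands to a full memory barrier; on UP builds it reduces to a compiler barrier, which is the caveat the updated wording retains.
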
diff --git a/arch/blackfin/include/asm/cmpxchg.h b/arch/blackfin/include/asm/cmpxchg.h
index c05868cc61c1..253928854299 100644
--- a/arch/blackfin/include/asm/cmpxchg.h
+++ b/arch/blackfin/include/asm/cmpxchg.h
@@ -128,6 +128,5 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
128#endif /* !CONFIG_SMP */ 128#endif /* !CONFIG_SMP */
129 129
130#define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) 130#define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
131#define tas(ptr) ((void)xchg((ptr), 1))
132 131
133#endif /* __ARCH_BLACKFIN_CMPXCHG__ */ 132#endif /* __ARCH_BLACKFIN_CMPXCHG__ */
diff --git a/arch/c6x/include/asm/cmpxchg.h b/arch/c6x/include/asm/cmpxchg.h
index b27c8cefb8c3..93d0a5a047a2 100644
--- a/arch/c6x/include/asm/cmpxchg.h
+++ b/arch/c6x/include/asm/cmpxchg.h
@@ -47,8 +47,6 @@ static inline unsigned int __xchg(unsigned int x, volatile void *ptr, int size)
47#define xchg(ptr, x) \ 47#define xchg(ptr, x) \
48 ((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \ 48 ((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \
49 sizeof(*(ptr)))) 49 sizeof(*(ptr))))
50#define tas(ptr) xchg((ptr), 1)
51
52 50
53#include <asm-generic/cmpxchg-local.h> 51#include <asm-generic/cmpxchg-local.h>
54 52
diff --git a/arch/frv/include/asm/cmpxchg.h b/arch/frv/include/asm/cmpxchg.h
index 5b04dd0aecab..a899765102ea 100644
--- a/arch/frv/include/asm/cmpxchg.h
+++ b/arch/frv/include/asm/cmpxchg.h
@@ -69,8 +69,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v);
69 69
70#endif 70#endif
71 71
72#define tas(ptr) (xchg((ptr), 1))
73
74/*****************************************************************************/ 72/*****************************************************************************/
75/* 73/*
76 * compare and conditionally exchange value with memory 74 * compare and conditionally exchange value with memory
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index df896a1c41d3..209c4b817c95 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -77,7 +77,7 @@ do { \
77 ___p1; \ 77 ___p1; \
78}) 78})
79 79
80#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 80#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
81 81
82/* 82/*
83 * The group barrier in front of the rsm & ssm are necessary to ensure 83 * The group barrier in front of the rsm & ssm are necessary to ensure
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 0eca6efc0631..a7af5fb7b914 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -34,7 +34,7 @@
34#define rmb() __asm__ __volatile__ ("sync" : : : "memory") 34#define rmb() __asm__ __volatile__ ("sync" : : : "memory")
35#define wmb() __asm__ __volatile__ ("sync" : : : "memory") 35#define wmb() __asm__ __volatile__ ("sync" : : : "memory")
36 36
37#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 37#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
38 38
39#ifdef __SUBARCH_HAS_LWSYNC 39#ifdef __SUBARCH_HAS_LWSYNC
40# define SMPWMB LWSYNC 40# define SMPWMB LWSYNC
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index d68e11e0df5e..7ffd0b19135c 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -36,7 +36,7 @@
36#define smp_mb__before_atomic() smp_mb() 36#define smp_mb__before_atomic() smp_mb()
37#define smp_mb__after_atomic() smp_mb() 37#define smp_mb__after_atomic() smp_mb()
38 38
39#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 39#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
40 40
41#define smp_store_release(p, v) \ 41#define smp_store_release(p, v) \
42do { \ 42do { \
diff --git a/arch/tile/include/asm/cmpxchg.h b/arch/tile/include/asm/cmpxchg.h
index 0ccda3c425be..25d5899497be 100644
--- a/arch/tile/include/asm/cmpxchg.h
+++ b/arch/tile/include/asm/cmpxchg.h
@@ -127,8 +127,6 @@ long long _atomic64_cmpxchg(long long *v, long long o, long long n);
127 127
128#endif 128#endif
129 129
130#define tas(ptr) xchg((ptr), 1)
131
132#endif /* __ASSEMBLY__ */ 130#endif /* __ASSEMBLY__ */
133 131
134#endif /* _ASM_TILE_CMPXCHG_H */ 132#endif /* _ASM_TILE_CMPXCHG_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3622f22b61..965fc4216f76 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -687,6 +687,14 @@ config PARAVIRT_SPINLOCKS
687 687
688 If you are unsure how to answer this question, answer Y. 688 If you are unsure how to answer this question, answer Y.
689 689
690config QUEUED_LOCK_STAT
691 bool "Paravirt queued spinlock statistics"
692 depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
693 ---help---
694 Enable the collection of statistical data on the slowpath
695 behavior of paravirtualized queued spinlocks and report
696 them on debugfs.
697
690source "arch/x86/xen/Kconfig" 698source "arch/x86/xen/Kconfig"
691 699
692config KVM_GUEST 700config KVM_GUEST
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index b002e711ba88..9f92c180ed2f 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -1,6 +1,65 @@
1#ifndef __ASM_QSPINLOCK_PARAVIRT_H 1#ifndef __ASM_QSPINLOCK_PARAVIRT_H
2#define __ASM_QSPINLOCK_PARAVIRT_H 2#define __ASM_QSPINLOCK_PARAVIRT_H
3 3
4/*
5 * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
6 * registers. For i386, however, only 1 32-bit register needs to be saved
7 * and restored. So an optimized version of __pv_queued_spin_unlock() is
8 * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
9 */
10#ifdef CONFIG_64BIT
11
12PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
13#define __pv_queued_spin_unlock __pv_queued_spin_unlock
14#define PV_UNLOCK "__raw_callee_save___pv_queued_spin_unlock"
15#define PV_UNLOCK_SLOWPATH "__raw_callee_save___pv_queued_spin_unlock_slowpath"
16
17/*
18 * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
19 * which combines the registers saving trunk and the body of the following
20 * C code:
21 *
22 * void __pv_queued_spin_unlock(struct qspinlock *lock)
23 * {
24 * struct __qspinlock *l = (void *)lock;
25 * u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
26 *
27 * if (likely(lockval == _Q_LOCKED_VAL))
28 * return;
29 * pv_queued_spin_unlock_slowpath(lock, lockval);
30 * }
31 *
32 * For x86-64,
33 * rdi = lock (first argument)
34 * rsi = lockval (second argument)
35 * rdx = internal variable (set to 0)
36 */
37asm (".pushsection .text;"
38 ".globl " PV_UNLOCK ";"
39 ".align 4,0x90;"
40 PV_UNLOCK ": "
41 "push %rdx;"
42 "mov $0x1,%eax;"
43 "xor %edx,%edx;"
44 "lock cmpxchg %dl,(%rdi);"
45 "cmp $0x1,%al;"
46 "jne .slowpath;"
47 "pop %rdx;"
48 "ret;"
49 ".slowpath: "
50 "push %rsi;"
51 "movzbl %al,%esi;"
52 "call " PV_UNLOCK_SLOWPATH ";"
53 "pop %rsi;"
54 "pop %rdx;"
55 "ret;"
56 ".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
57 ".popsection");
58
59#else /* CONFIG_64BIT */
60
61extern void __pv_queued_spin_unlock(struct qspinlock *lock);
4PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); 62PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
5 63
64#endif /* CONFIG_64BIT */
6#endif 65#endif
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index b42afada1280..0f45f93ef692 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -93,7 +93,7 @@
93#endif /* CONFIG_SMP */ 93#endif /* CONFIG_SMP */
94 94
95#ifndef smp_store_mb 95#ifndef smp_store_mb
96#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) 96#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0)
97#endif 97#endif
98 98
99#ifndef smp_mb__before_atomic 99#ifndef smp_mb__before_atomic
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index e2aadbc7151f..39e1cb201b8e 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -12,8 +12,9 @@
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
15 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
15 * 16 *
16 * Authors: Waiman Long <waiman.long@hp.com> 17 * Authors: Waiman Long <waiman.long@hpe.com>
17 */ 18 */
18#ifndef __ASM_GENERIC_QSPINLOCK_H 19#ifndef __ASM_GENERIC_QSPINLOCK_H
19#define __ASM_GENERIC_QSPINLOCK_H 20#define __ASM_GENERIC_QSPINLOCK_H
@@ -62,7 +63,7 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
62static __always_inline int queued_spin_trylock(struct qspinlock *lock) 63static __always_inline int queued_spin_trylock(struct qspinlock *lock)
63{ 64{
64 if (!atomic_read(&lock->val) && 65 if (!atomic_read(&lock->val) &&
65 (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0)) 66 (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0))
66 return 1; 67 return 1;
67 return 0; 68 return 0;
68} 69}
@@ -77,7 +78,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
77{ 78{
78 u32 val; 79 u32 val;
79 80
80 val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL); 81 val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
81 if (likely(val == 0)) 82 if (likely(val == 0))
82 return; 83 return;
83 queued_spin_lock_slowpath(lock, val); 84 queued_spin_lock_slowpath(lock, val);
@@ -93,7 +94,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
93 /* 94 /*
94 * smp_mb__before_atomic() in order to guarantee release semantics 95 * smp_mb__before_atomic() in order to guarantee release semantics
95 */ 96 */
96 smp_mb__before_atomic_dec(); 97 smp_mb__before_atomic();
97 atomic_sub(_Q_LOCKED_VAL, &lock->val); 98 atomic_sub(_Q_LOCKED_VAL, &lock->val);
98} 99}
99#endif 100#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 4dac1036594f..00b042c49ccd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -299,6 +299,23 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
299 __u.__val; \ 299 __u.__val; \
300}) 300})
301 301
302/**
303 * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
304 * @cond: boolean expression to wait for
305 *
306 * Equivalent to using smp_load_acquire() on the condition variable but employs
307 * the control dependency of the wait to reduce the barrier on many platforms.
308 *
309 * The control dependency provides a LOAD->STORE order, the additional RMB
310 * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
311 * aka. ACQUIRE.
312 */
313#define smp_cond_acquire(cond) do { \
314 while (!(cond)) \
315 cpu_relax(); \
316 smp_rmb(); /* ctrl + rmb := acquire */ \
317} while (0)
318
302#endif /* __KERNEL__ */ 319#endif /* __KERNEL__ */
303 320
304#endif /* __ASSEMBLY__ */ 321#endif /* __ASSEMBLY__ */
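
For context, a minimal sketch of how the new smp_cond_acquire() helper is meant to pair with smp_store_release(); this mirrors the kernel/sched hunks later in this series and is illustrative only, not an additional change:

    /* releasing side (finish_lock_switch() in this series) */
    smp_store_release(&prev->on_cpu, 0);

    /* acquiring side (try_to_wake_up() in this series): spin until the
     * released store is observed; the control dependency plus the
     * smp_rmb() inside the macro provide ACQUIRE ordering for all
     * accesses that follow.
     */
    smp_cond_acquire(!p->on_cpu);
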
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d7549825a..8a310e240cda 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -725,9 +725,12 @@ static struct futex_pi_state * alloc_pi_state(void)
725} 725}
726 726
727/* 727/*
728 * Drops a reference to the pi_state object and frees or caches it
729 * when the last reference is gone.
730 *
728 * Must be called with the hb lock held. 731 * Must be called with the hb lock held.
729 */ 732 */
730static void free_pi_state(struct futex_pi_state *pi_state) 733static void put_pi_state(struct futex_pi_state *pi_state)
731{ 734{
732 if (!pi_state) 735 if (!pi_state)
733 return; 736 return;
@@ -1706,31 +1709,35 @@ retry_private:
1706 * exist yet, look it up one more time to ensure we have a 1709 * exist yet, look it up one more time to ensure we have a
1707 * reference to it. If the lock was taken, ret contains the 1710 * reference to it. If the lock was taken, ret contains the
1708 * vpid of the top waiter task. 1711 * vpid of the top waiter task.
1712 * If the lock was not taken, we have pi_state and an initial
1713 * refcount on it. In case of an error we have nothing.
1709 */ 1714 */
1710 if (ret > 0) { 1715 if (ret > 0) {
1711 WARN_ON(pi_state); 1716 WARN_ON(pi_state);
1712 drop_count++; 1717 drop_count++;
1713 task_count++; 1718 task_count++;
1714 /* 1719 /*
1715 * If we acquired the lock, then the user 1720 * If we acquired the lock, then the user space value
1716 * space value of uaddr2 should be vpid. It 1721 * of uaddr2 should be vpid. It cannot be changed by
1717 * cannot be changed by the top waiter as it 1722 * the top waiter as it is blocked on hb2 lock if it
1718 * is blocked on hb2 lock if it tries to do 1723 * tries to do so. If something fiddled with it behind
1719 * so. If something fiddled with it behind our 1724 * our back the pi state lookup might unearth it. So
1720 * back the pi state lookup might unearth 1725 * we rather use the known value than rereading and
1721 * it. So we rather use the known value than 1726 * handing potential crap to lookup_pi_state.
1722 * rereading and handing potential crap to 1727 *
1723 * lookup_pi_state. 1728 * If that call succeeds then we have pi_state and an
1729 * initial refcount on it.
1724 */ 1730 */
1725 ret = lookup_pi_state(ret, hb2, &key2, &pi_state); 1731 ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
1726 } 1732 }
1727 1733
1728 switch (ret) { 1734 switch (ret) {
1729 case 0: 1735 case 0:
1736 /* We hold a reference on the pi state. */
1730 break; 1737 break;
1738
1739 /* If the above failed, then pi_state is NULL */
1731 case -EFAULT: 1740 case -EFAULT:
1732 free_pi_state(pi_state);
1733 pi_state = NULL;
1734 double_unlock_hb(hb1, hb2); 1741 double_unlock_hb(hb1, hb2);
1735 hb_waiters_dec(hb2); 1742 hb_waiters_dec(hb2);
1736 put_futex_key(&key2); 1743 put_futex_key(&key2);
@@ -1746,8 +1753,6 @@ retry_private:
1746 * exit to complete. 1753 * exit to complete.
1747 * - The user space value changed. 1754 * - The user space value changed.
1748 */ 1755 */
1749 free_pi_state(pi_state);
1750 pi_state = NULL;
1751 double_unlock_hb(hb1, hb2); 1756 double_unlock_hb(hb1, hb2);
1752 hb_waiters_dec(hb2); 1757 hb_waiters_dec(hb2);
1753 put_futex_key(&key2); 1758 put_futex_key(&key2);
@@ -1801,30 +1806,58 @@ retry_private:
1801 * of requeue_pi if we couldn't acquire the lock atomically. 1806 * of requeue_pi if we couldn't acquire the lock atomically.
1802 */ 1807 */
1803 if (requeue_pi) { 1808 if (requeue_pi) {
1804 /* Prepare the waiter to take the rt_mutex. */ 1809 /*
1810 * Prepare the waiter to take the rt_mutex. Take a
1811 * refcount on the pi_state and store the pointer in
1812 * the futex_q object of the waiter.
1813 */
1805 atomic_inc(&pi_state->refcount); 1814 atomic_inc(&pi_state->refcount);
1806 this->pi_state = pi_state; 1815 this->pi_state = pi_state;
1807 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, 1816 ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1808 this->rt_waiter, 1817 this->rt_waiter,
1809 this->task); 1818 this->task);
1810 if (ret == 1) { 1819 if (ret == 1) {
1811 /* We got the lock. */ 1820 /*
1821 * We got the lock. We do neither drop the
1822 * refcount on pi_state nor clear
1823 * this->pi_state because the waiter needs the
1824 * pi_state for cleaning up the user space
1825 * value. It will drop the refcount after
1826 * doing so.
1827 */
1812 requeue_pi_wake_futex(this, &key2, hb2); 1828 requeue_pi_wake_futex(this, &key2, hb2);
1813 drop_count++; 1829 drop_count++;
1814 continue; 1830 continue;
1815 } else if (ret) { 1831 } else if (ret) {
1816 /* -EDEADLK */ 1832 /*
1833 * rt_mutex_start_proxy_lock() detected a
1834 * potential deadlock when we tried to queue
1835 * that waiter. Drop the pi_state reference
1836 * which we took above and remove the pointer
1837 * to the state from the waiters futex_q
1838 * object.
1839 */
1817 this->pi_state = NULL; 1840 this->pi_state = NULL;
1818 free_pi_state(pi_state); 1841 put_pi_state(pi_state);
1819 goto out_unlock; 1842 /*
1843 * We stop queueing more waiters and let user
1844 * space deal with the mess.
1845 */
1846 break;
1820 } 1847 }
1821 } 1848 }
1822 requeue_futex(this, hb1, hb2, &key2); 1849 requeue_futex(this, hb1, hb2, &key2);
1823 drop_count++; 1850 drop_count++;
1824 } 1851 }
1825 1852
1853 /*
1854 * We took an extra initial reference to the pi_state either
1855 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
1856 * need to drop it here again.
1857 */
1858 put_pi_state(pi_state);
1859
1826out_unlock: 1860out_unlock:
1827 free_pi_state(pi_state);
1828 double_unlock_hb(hb1, hb2); 1861 double_unlock_hb(hb1, hb2);
1829 wake_up_q(&wake_q); 1862 wake_up_q(&wake_q);
1830 hb_waiters_dec(hb2); 1863 hb_waiters_dec(hb2);
@@ -1973,7 +2006,7 @@ static void unqueue_me_pi(struct futex_q *q)
1973 __unqueue_futex(q); 2006 __unqueue_futex(q);
1974 2007
1975 BUG_ON(!q->pi_state); 2008 BUG_ON(!q->pi_state);
1976 free_pi_state(q->pi_state); 2009 put_pi_state(q->pi_state);
1977 q->pi_state = NULL; 2010 q->pi_state = NULL;
1978 2011
1979 spin_unlock(q->lock_ptr); 2012 spin_unlock(q->lock_ptr);
@@ -2755,6 +2788,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2755 if (q.pi_state && (q.pi_state->owner != current)) { 2788 if (q.pi_state && (q.pi_state->owner != current)) {
2756 spin_lock(q.lock_ptr); 2789 spin_lock(q.lock_ptr);
2757 ret = fixup_pi_state_owner(uaddr2, &q, current); 2790 ret = fixup_pi_state_owner(uaddr2, &q, current);
2791 /*
2792 * Drop the reference to the pi state which
2793 * the requeue_pi() code acquired for us.
2794 */
2795 put_pi_state(q.pi_state);
2758 spin_unlock(q.lock_ptr); 2796 spin_unlock(q.lock_ptr);
2759 } 2797 }
2760 } else { 2798 } else {
@@ -3046,7 +3084,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
3046 3084
3047 if (op & FUTEX_CLOCK_REALTIME) { 3085 if (op & FUTEX_CLOCK_REALTIME) {
3048 flags |= FLAGS_CLOCKRT; 3086 flags |= FLAGS_CLOCKRT;
3049 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) 3087 if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
3088 cmd != FUTEX_WAIT_REQUEUE_PI)
3050 return -ENOSYS; 3089 return -ENOSYS;
3051 } 3090 }
3052 3091
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..393d1874b9e0 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. 14 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
15 * (C) Copyright 2013-2014 Red Hat, Inc. 15 * (C) Copyright 2013-2014 Red Hat, Inc.
16 * (C) Copyright 2015 Intel Corp. 16 * (C) Copyright 2015 Intel Corp.
17 * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
17 * 18 *
18 * Authors: Waiman Long <waiman.long@hp.com> 19 * Authors: Waiman Long <waiman.long@hpe.com>
19 * Peter Zijlstra <peterz@infradead.org> 20 * Peter Zijlstra <peterz@infradead.org>
20 */ 21 */
21 22
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
176{ 177{
177 struct __qspinlock *l = (void *)lock; 178 struct __qspinlock *l = (void *)lock;
178 179
179 return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; 180 /*
181 * Use release semantics to make sure that the MCS node is properly
182 * initialized before changing the tail code.
183 */
184 return (u32)xchg_release(&l->tail,
185 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
180} 186}
181 187
182#else /* _Q_PENDING_BITS == 8 */ 188#else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
208 214
209 for (;;) { 215 for (;;) {
210 new = (val & _Q_LOCKED_PENDING_MASK) | tail; 216 new = (val & _Q_LOCKED_PENDING_MASK) | tail;
211 old = atomic_cmpxchg(&lock->val, val, new); 217 /*
218 * Use release semantics to make sure that the MCS node is
219 * properly initialized before changing the tail code.
220 */
221 old = atomic_cmpxchg_release(&lock->val, val, new);
212 if (old == val) 222 if (old == val)
213 break; 223 break;
214 224
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
238 */ 248 */
239 249
240static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } 250static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
241static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } 251static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
252 struct mcs_spinlock *prev) { }
242static __always_inline void __pv_kick_node(struct qspinlock *lock, 253static __always_inline void __pv_kick_node(struct qspinlock *lock,
243 struct mcs_spinlock *node) { } 254 struct mcs_spinlock *node) { }
244static __always_inline void __pv_wait_head(struct qspinlock *lock, 255static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
245 struct mcs_spinlock *node) { } 256 struct mcs_spinlock *node)
257 { return 0; }
246 258
247#define pv_enabled() false 259#define pv_enabled() false
248 260
249#define pv_init_node __pv_init_node 261#define pv_init_node __pv_init_node
250#define pv_wait_node __pv_wait_node 262#define pv_wait_node __pv_wait_node
251#define pv_kick_node __pv_kick_node 263#define pv_kick_node __pv_kick_node
252#define pv_wait_head __pv_wait_head 264#define pv_wait_head_or_lock __pv_wait_head_or_lock
253 265
254#ifdef CONFIG_PARAVIRT_SPINLOCKS 266#ifdef CONFIG_PARAVIRT_SPINLOCKS
255#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath 267#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
319 if (val == new) 331 if (val == new)
320 new |= _Q_PENDING_VAL; 332 new |= _Q_PENDING_VAL;
321 333
322 old = atomic_cmpxchg(&lock->val, val, new); 334 /*
335 * Acquire semantic is required here as the function may
336 * return immediately if the lock was free.
337 */
338 old = atomic_cmpxchg_acquire(&lock->val, val, new);
323 if (old == val) 339 if (old == val)
324 break; 340 break;
325 341
@@ -382,6 +398,7 @@ queue:
382 * p,*,* -> n,*,* 398 * p,*,* -> n,*,*
383 */ 399 */
384 old = xchg_tail(lock, tail); 400 old = xchg_tail(lock, tail);
401 next = NULL;
385 402
386 /* 403 /*
387 * if there was a previous node; link it and wait until reaching the 404 * if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
391 prev = decode_tail(old); 408 prev = decode_tail(old);
392 WRITE_ONCE(prev->next, node); 409 WRITE_ONCE(prev->next, node);
393 410
394 pv_wait_node(node); 411 pv_wait_node(node, prev);
395 arch_mcs_spin_lock_contended(&node->locked); 412 arch_mcs_spin_lock_contended(&node->locked);
413
414 /*
415 * While waiting for the MCS lock, the next pointer may have
416 * been set by another lock waiter. We optimistically load
417 * the next pointer & prefetch the cacheline for writing
418 * to reduce latency in the upcoming MCS unlock operation.
419 */
420 next = READ_ONCE(node->next);
421 if (next)
422 prefetchw(next);
396 } 423 }
397 424
398 /* 425 /*
@@ -406,11 +433,22 @@ queue:
406 * sequentiality; this is because the set_locked() function below 433 * sequentiality; this is because the set_locked() function below
407 * does not imply a full barrier. 434 * does not imply a full barrier.
408 * 435 *
436 * The PV pv_wait_head_or_lock function, if active, will acquire
437 * the lock and return a non-zero value. So we have to skip the
438 * smp_load_acquire() call. As the next PV queue head hasn't been
439 * designated yet, there is no way for the locked value to become
440 * _Q_SLOW_VAL. So both the set_locked() and the
441 * atomic_cmpxchg_relaxed() calls will be safe.
442 *
443 * If PV isn't active, 0 will be returned instead.
444 *
409 */ 445 */
410 pv_wait_head(lock, node); 446 if ((val = pv_wait_head_or_lock(lock, node)))
411 while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) 447 goto locked;
412 cpu_relax();
413 448
449 smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
450
451locked:
414 /* 452 /*
415 * claim the lock: 453 * claim the lock:
416 * 454 *
@@ -422,11 +460,17 @@ queue:
422 * to grab the lock. 460 * to grab the lock.
423 */ 461 */
424 for (;;) { 462 for (;;) {
425 if (val != tail) { 463 /* In the PV case we might already have _Q_LOCKED_VAL set */
464 if ((val & _Q_TAIL_MASK) != tail) {
426 set_locked(lock); 465 set_locked(lock);
427 break; 466 break;
428 } 467 }
429 old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); 468 /*
469 * The smp_load_acquire() call above has provided the necessary
470 * acquire semantics required for locking. At most two
471 * iterations of this loop may be ran.
472 */
473 old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
430 if (old == val) 474 if (old == val)
431 goto release; /* No contention */ 475 goto release; /* No contention */
432 476
@@ -434,10 +478,12 @@ queue:
434 } 478 }
435 479
436 /* 480 /*
437 * contended path; wait for next, release. 481 * contended path; wait for next if not observed yet, release.
438 */ 482 */
439 while (!(next = READ_ONCE(node->next))) 483 if (!next) {
440 cpu_relax(); 484 while (!(next = READ_ONCE(node->next)))
485 cpu_relax();
486 }
441 487
442 arch_mcs_spin_unlock_contended(&next->locked); 488 arch_mcs_spin_unlock_contended(&next->locked);
443 pv_kick_node(lock, next); 489 pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
462#undef pv_init_node 508#undef pv_init_node
463#undef pv_wait_node 509#undef pv_wait_node
464#undef pv_kick_node 510#undef pv_kick_node
465#undef pv_wait_head 511#undef pv_wait_head_or_lock
466 512
467#undef queued_spin_lock_slowpath 513#undef queued_spin_lock_slowpath
468#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath 514#define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff4829b..87bb235c3448 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
23#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) 23#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
24 24
25/* 25/*
26 * Queue Node Adaptive Spinning
27 *
28 * A queue node vCPU will stop spinning if the vCPU in the previous node is
29 * not running. The one lock stealing attempt allowed at slowpath entry
30 * mitigates the slight slowdown for non-overcommitted guest with this
31 * aggressive wait-early mechanism.
32 *
33 * The status of the previous node will be checked at fixed interval
34 * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
35 * pound on the cacheline of the previous node too heavily.
36 */
37#define PV_PREV_CHECK_MASK 0xff
38
39/*
26 * Queue node uses: vcpu_running & vcpu_halted. 40 * Queue node uses: vcpu_running & vcpu_halted.
27 * Queue head uses: vcpu_running & vcpu_hashed. 41 * Queue head uses: vcpu_running & vcpu_hashed.
28 */ 42 */
@@ -41,6 +55,94 @@ struct pv_node {
41}; 55};
42 56
43/* 57/*
58 * By replacing the regular queued_spin_trylock() with the function below,
59 * it will be called once when a lock waiter enter the PV slowpath before
60 * being queued. By allowing one lock stealing attempt here when the pending
61 * bit is off, it helps to reduce the performance impact of lock waiter
62 * preemption without the drawback of lock starvation.
63 */
64#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
65static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
66{
67 struct __qspinlock *l = (void *)lock;
68
69 return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
70 (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
71}
72
73/*
74 * The pending bit is used by the queue head vCPU to indicate that it
75 * is actively spinning on the lock and no lock stealing is allowed.
76 */
77#if _Q_PENDING_BITS == 8
78static __always_inline void set_pending(struct qspinlock *lock)
79{
80 struct __qspinlock *l = (void *)lock;
81
82 WRITE_ONCE(l->pending, 1);
83}
84
85static __always_inline void clear_pending(struct qspinlock *lock)
86{
87 struct __qspinlock *l = (void *)lock;
88
89 WRITE_ONCE(l->pending, 0);
90}
91
92/*
93 * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
94 * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
95 * just to be sure that it will get it.
96 */
97static __always_inline int trylock_clear_pending(struct qspinlock *lock)
98{
99 struct __qspinlock *l = (void *)lock;
100
101 return !READ_ONCE(l->locked) &&
102 (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
103 == _Q_PENDING_VAL);
104}
105#else /* _Q_PENDING_BITS == 8 */
106static __always_inline void set_pending(struct qspinlock *lock)
107{
108 atomic_set_mask(_Q_PENDING_VAL, &lock->val);
109}
110
111static __always_inline void clear_pending(struct qspinlock *lock)
112{
113 atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
114}
115
116static __always_inline int trylock_clear_pending(struct qspinlock *lock)
117{
118 int val = atomic_read(&lock->val);
119
120 for (;;) {
121 int old, new;
122
123 if (val & _Q_LOCKED_MASK)
124 break;
125
126 /*
127 * Try to clear pending bit & set locked bit
128 */
129 old = val;
130 new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
131 val = atomic_cmpxchg(&lock->val, old, new);
132
133 if (val == old)
134 return 1;
135 }
136 return 0;
137}
138#endif /* _Q_PENDING_BITS == 8 */
139
140/*
141 * Include queued spinlock statistics code
142 */
143#include "qspinlock_stat.h"
144
145/*
44 * Lock and MCS node addresses hash table for fast lookup 146 * Lock and MCS node addresses hash table for fast lookup
45 * 147 *
46 * Hashing is done on a per-cacheline basis to minimize the need to access 148 * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
100{ 202{
101 unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); 203 unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
102 struct pv_hash_entry *he; 204 struct pv_hash_entry *he;
205 int hopcnt = 0;
103 206
104 for_each_hash_entry(he, offset, hash) { 207 for_each_hash_entry(he, offset, hash) {
208 hopcnt++;
105 if (!cmpxchg(&he->lock, NULL, lock)) { 209 if (!cmpxchg(&he->lock, NULL, lock)) {
106 WRITE_ONCE(he->node, node); 210 WRITE_ONCE(he->node, node);
211 qstat_hop(hopcnt);
107 return &he->lock; 212 return &he->lock;
108 } 213 }
109 } 214 }
@@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
144} 249}
145 250
146/* 251/*
252 * Return true if when it is time to check the previous node which is not
253 * in a running state.
254 */
255static inline bool
256pv_wait_early(struct pv_node *prev, int loop)
257{
258
259 if ((loop & PV_PREV_CHECK_MASK) != 0)
260 return false;
261
262 return READ_ONCE(prev->state) != vcpu_running;
263}
264
265/*
147 * Initialize the PV part of the mcs_spinlock node. 266 * Initialize the PV part of the mcs_spinlock node.
148 */ 267 */
149static void pv_init_node(struct mcs_spinlock *node) 268static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
161 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its 280 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
162 * behalf. 281 * behalf.
163 */ 282 */
164static void pv_wait_node(struct mcs_spinlock *node) 283static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
165{ 284{
166 struct pv_node *pn = (struct pv_node *)node; 285 struct pv_node *pn = (struct pv_node *)node;
286 struct pv_node *pp = (struct pv_node *)prev;
287 int waitcnt = 0;
167 int loop; 288 int loop;
289 bool wait_early;
168 290
169 for (;;) { 291 /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
170 for (loop = SPIN_THRESHOLD; loop; loop--) { 292 for (;; waitcnt++) {
293 for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
171 if (READ_ONCE(node->locked)) 294 if (READ_ONCE(node->locked))
172 return; 295 return;
296 if (pv_wait_early(pp, loop)) {
297 wait_early = true;
298 break;
299 }
173 cpu_relax(); 300 cpu_relax();
174 } 301 }
175 302
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
184 */ 311 */
185 smp_store_mb(pn->state, vcpu_halted); 312 smp_store_mb(pn->state, vcpu_halted);
186 313
187 if (!READ_ONCE(node->locked)) 314 if (!READ_ONCE(node->locked)) {
315 qstat_inc(qstat_pv_wait_node, true);
316 qstat_inc(qstat_pv_wait_again, waitcnt);
317 qstat_inc(qstat_pv_wait_early, wait_early);
188 pv_wait(&pn->state, vcpu_halted); 318 pv_wait(&pn->state, vcpu_halted);
319 }
189 320
190 /* 321 /*
191 * If pv_kick_node() changed us to vcpu_hashed, retain that value 322 * If pv_kick_node() changed us to vcpu_hashed, retain that
192 * so that pv_wait_head() knows to not also try to hash this lock. 323 * value so that pv_wait_head_or_lock() knows to not also try
324 * to hash this lock.
193 */ 325 */
194 cmpxchg(&pn->state, vcpu_halted, vcpu_running); 326 cmpxchg(&pn->state, vcpu_halted, vcpu_running);
195 327
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
200 * So it is better to spin for a while in the hope that the 332 * So it is better to spin for a while in the hope that the
201 * MCS lock will be released soon. 333 * MCS lock will be released soon.
202 */ 334 */
335 qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
203 } 336 }
204 337
205 /* 338 /*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
212/* 345/*
213 * Called after setting next->locked = 1 when we're the lock owner. 346 * Called after setting next->locked = 1 when we're the lock owner.
214 * 347 *
215 * Instead of waking the waiters stuck in pv_wait_node() advance their state such 348 * Instead of waking the waiters stuck in pv_wait_node() advance their state
216 * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. 349 * such that they're waiting in pv_wait_head_or_lock(), this avoids a
350 * wake/sleep cycle.
217 */ 351 */
218static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) 352static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
219{ 353{
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
242} 376}
243 377
244/* 378/*
245 * Wait for l->locked to become clear; halt the vcpu after a short spin. 379 * Wait for l->locked to become clear and acquire the lock;
380 * halt the vcpu after a short spin.
246 * __pv_queued_spin_unlock() will wake us. 381 * __pv_queued_spin_unlock() will wake us.
382 *
383 * The current value of the lock will be returned for additional processing.
247 */ 384 */
248static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) 385static u32
386pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
249{ 387{
250 struct pv_node *pn = (struct pv_node *)node; 388 struct pv_node *pn = (struct pv_node *)node;
251 struct __qspinlock *l = (void *)lock; 389 struct __qspinlock *l = (void *)lock;
252 struct qspinlock **lp = NULL; 390 struct qspinlock **lp = NULL;
391 int waitcnt = 0;
253 int loop; 392 int loop;
254 393
255 /* 394 /*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
259 if (READ_ONCE(pn->state) == vcpu_hashed) 398 if (READ_ONCE(pn->state) == vcpu_hashed)
260 lp = (struct qspinlock **)1; 399 lp = (struct qspinlock **)1;
261 400
262 for (;;) { 401 for (;; waitcnt++) {
402 /*
403 * Set correct vCPU state to be used by queue node wait-early
404 * mechanism.
405 */
406 WRITE_ONCE(pn->state, vcpu_running);
407
408 /*
409 * Set the pending bit in the active lock spinning loop to
410 * disable lock stealing before attempting to acquire the lock.
411 */
412 set_pending(lock);
263 for (loop = SPIN_THRESHOLD; loop; loop--) { 413 for (loop = SPIN_THRESHOLD; loop; loop--) {
264 if (!READ_ONCE(l->locked)) 414 if (trylock_clear_pending(lock))
265 return; 415 goto gotlock;
266 cpu_relax(); 416 cpu_relax();
267 } 417 }
418 clear_pending(lock);
419
268 420
269 if (!lp) { /* ONCE */ 421 if (!lp) { /* ONCE */
270 lp = pv_hash(lock, pn); 422 lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
280 * 432 *
281 * Matches the smp_rmb() in __pv_queued_spin_unlock(). 433 * Matches the smp_rmb() in __pv_queued_spin_unlock().
282 */ 434 */
283 if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { 435 if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
284 /* 436 /*
285 * The lock is free and _Q_SLOW_VAL has never 437 * The lock was free and now we own the lock.
286 * been set. Therefore we need to unhash before 438 * Change the lock value back to _Q_LOCKED_VAL
287 * getting the lock. 439 * and unhash the table.
288 */ 440 */
441 WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
289 WRITE_ONCE(*lp, NULL); 442 WRITE_ONCE(*lp, NULL);
290 return; 443 goto gotlock;
291 } 444 }
292 } 445 }
446 WRITE_ONCE(pn->state, vcpu_halted);
447 qstat_inc(qstat_pv_wait_head, true);
448 qstat_inc(qstat_pv_wait_again, waitcnt);
293 pv_wait(&l->locked, _Q_SLOW_VAL); 449 pv_wait(&l->locked, _Q_SLOW_VAL);
294 450
295 /* 451 /*
296 * The unlocker should have freed the lock before kicking the 452 * The unlocker should have freed the lock before kicking the
297 * CPU. So if the lock is still not free, it is a spurious 453 * CPU. So if the lock is still not free, it is a spurious
298 * wakeup and so the vCPU should wait again after spinning for 454 * wakeup or another vCPU has stolen the lock. The current
299 * a while. 455 * vCPU should spin again.
300 */ 456 */
457 qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
301 } 458 }
302 459
303 /* 460 /*
304 * Lock is unlocked now; the caller will acquire it without waiting. 461 * The cmpxchg() or xchg() call before coming here provides the
305 * As with pv_wait_node() we rely on the caller to do a load-acquire 462 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
306 * for us. 463 * here is to indicate to the compiler that the value will always
464 * be nozero to enable better code optimization.
307 */ 465 */
466gotlock:
467 return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
308} 468}
309 469
310/* 470/*
311 * PV version of the unlock function to be used in stead of 471 * PV versions of the unlock fastpath and slowpath functions to be used
312 * queued_spin_unlock(). 472 * instead of queued_spin_unlock().
313 */ 473 */
314__visible void __pv_queued_spin_unlock(struct qspinlock *lock) 474__visible void
475__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
315{ 476{
316 struct __qspinlock *l = (void *)lock; 477 struct __qspinlock *l = (void *)lock;
317 struct pv_node *node; 478 struct pv_node *node;
318 u8 locked;
319
320 /*
321 * We must not unlock if SLOW, because in that case we must first
322 * unhash. Otherwise it would be possible to have multiple @lock
323 * entries, which would be BAD.
324 */
325 locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
326 if (likely(locked == _Q_LOCKED_VAL))
327 return;
328 479
329 if (unlikely(locked != _Q_SLOW_VAL)) { 480 if (unlikely(locked != _Q_SLOW_VAL)) {
330 WARN(!debug_locks_silent, 481 WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
338 * so we need a barrier to order the read of the node data in 489 * so we need a barrier to order the read of the node data in
339 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. 490 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
340 * 491 *
341 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. 492 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
342 */ 493 */
343 smp_rmb(); 494 smp_rmb();
344 495
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
361 * vCPU is harmless other than the additional latency in completing 512 * vCPU is harmless other than the additional latency in completing
362 * the unlock. 513 * the unlock.
363 */ 514 */
515 qstat_inc(qstat_pv_kick_unlock, true);
364 pv_kick(node->cpu); 516 pv_kick(node->cpu);
365} 517}
518
366/* 519/*
367 * Include the architecture specific callee-save thunk of the 520 * Include the architecture specific callee-save thunk of the
368 * __pv_queued_spin_unlock(). This thunk is put together with 521 * __pv_queued_spin_unlock(). This thunk is put together with
369 * __pv_queued_spin_unlock() near the top of the file to make sure 522 * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
370 * that the callee-save thunk and the real unlock function are close 523 * function close to each other sharing consecutive instruction cachelines.
371 * to each other sharing consecutive instruction cachelines. 524 * Alternatively, architecture specific version of __pv_queued_spin_unlock()
525 * can be defined.
372 */ 526 */
373#include <asm/qspinlock_paravirt.h> 527#include <asm/qspinlock_paravirt.h>
374 528
529#ifndef __pv_queued_spin_unlock
530__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
531{
532 struct __qspinlock *l = (void *)lock;
533 u8 locked;
534
535 /*
536 * We must not unlock if SLOW, because in that case we must first
537 * unhash. Otherwise it would be possible to have multiple @lock
538 * entries, which would be BAD.
539 */
540 locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
541 if (likely(locked == _Q_LOCKED_VAL))
542 return;
543
544 __pv_queued_spin_unlock_slowpath(lock, locked);
545}
546#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000000..640dcecdd1df
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,300 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * Authors: Waiman Long <waiman.long@hpe.com>
13 */
14
15/*
16 * When queued spinlock statistical counters are enabled, the following
17 * debugfs files will be created for reporting the counter values:
18 *
19 * <debugfs>/qlockstat/
20 * pv_hash_hops - average # of hops per hashing operation
21 * pv_kick_unlock - # of vCPU kicks issued at unlock time
22 * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
23 * pv_latency_kick - average latency (ns) of vCPU kick operation
24 * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
25 * pv_lock_stealing - # of lock stealing operations
26 * pv_spurious_wakeup - # of spurious wakeups
27 * pv_wait_again - # of vCPU wait's that happened after a vCPU kick
28 * pv_wait_early - # of early vCPU wait's
29 * pv_wait_head - # of vCPU wait's at the queue head
30 * pv_wait_node - # of vCPU wait's at a non-head queue node
31 *
32 * Writing to the "reset_counters" file will reset all the above counter
33 * values.
34 *
35 * These statistical counters are implemented as per-cpu variables which are
36 * summed and computed whenever the corresponding debugfs files are read. This
37 * minimizes added overhead making the counters usable even in a production
38 * environment.
39 *
40 * There may be slight difference between pv_kick_wake and pv_kick_unlock.
41 */
42enum qlock_stats {
43 qstat_pv_hash_hops,
44 qstat_pv_kick_unlock,
45 qstat_pv_kick_wake,
46 qstat_pv_latency_kick,
47 qstat_pv_latency_wake,
48 qstat_pv_lock_stealing,
49 qstat_pv_spurious_wakeup,
50 qstat_pv_wait_again,
51 qstat_pv_wait_early,
52 qstat_pv_wait_head,
53 qstat_pv_wait_node,
54 qstat_num, /* Total number of statistical counters */
55 qstat_reset_cnts = qstat_num,
56};
57
58#ifdef CONFIG_QUEUED_LOCK_STAT
59/*
60 * Collect pvqspinlock statistics
61 */
62#include <linux/debugfs.h>
63#include <linux/sched.h>
64#include <linux/fs.h>
65
66static const char * const qstat_names[qstat_num + 1] = {
67 [qstat_pv_hash_hops] = "pv_hash_hops",
68 [qstat_pv_kick_unlock] = "pv_kick_unlock",
69 [qstat_pv_kick_wake] = "pv_kick_wake",
70 [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
71 [qstat_pv_latency_kick] = "pv_latency_kick",
72 [qstat_pv_latency_wake] = "pv_latency_wake",
73 [qstat_pv_lock_stealing] = "pv_lock_stealing",
74 [qstat_pv_wait_again] = "pv_wait_again",
75 [qstat_pv_wait_early] = "pv_wait_early",
76 [qstat_pv_wait_head] = "pv_wait_head",
77 [qstat_pv_wait_node] = "pv_wait_node",
78 [qstat_reset_cnts] = "reset_counters",
79};
80
81/*
82 * Per-cpu counters
83 */
84static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
85static DEFINE_PER_CPU(u64, pv_kick_time);
86
87/*
88 * Function to read and return the qlock statistical counter values
89 *
90 * The following counters are handled specially:
91 * 1. qstat_pv_latency_kick
92 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
93 * 2. qstat_pv_latency_wake
94 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
95 * 3. qstat_pv_hash_hops
96 * Average hops/hash = pv_hash_hops/pv_kick_unlock
97 */
98static ssize_t qstat_read(struct file *file, char __user *user_buf,
99 size_t count, loff_t *ppos)
100{
101 char buf[64];
102 int cpu, counter, len;
103 u64 stat = 0, kicks = 0;
104
105 /*
106 * Get the counter ID stored in file->f_inode->i_private
107 */
108 if (!file->f_inode) {
109 WARN_ON_ONCE(1);
110 return -EBADF;
111 }
112 counter = (long)(file->f_inode->i_private);
113
114 if (counter >= qstat_num)
115 return -EBADF;
116
117 for_each_possible_cpu(cpu) {
118 stat += per_cpu(qstats[counter], cpu);
119 /*
120 * Need to sum additional counter for some of them
121 */
122 switch (counter) {
123
124 case qstat_pv_latency_kick:
125 case qstat_pv_hash_hops:
126 kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
127 break;
128
129 case qstat_pv_latency_wake:
130 kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
131 break;
132 }
133 }
134
135 if (counter == qstat_pv_hash_hops) {
136 u64 frac;
137
138 frac = 100ULL * do_div(stat, kicks);
139 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
140
141 /*
142 * Return a X.XX decimal number
143 */
144 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
145 } else {
146 /*
147 * Round to the nearest ns
148 */
149 if ((counter == qstat_pv_latency_kick) ||
150 (counter == qstat_pv_latency_wake)) {
151 stat = 0;
152 if (kicks)
153 stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
154 }
155 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
156 }
157
158 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
159}
160
161/*
162 * Function to handle write request
163 *
164 * When counter = reset_cnts, reset all the counter values.
165 * Since the counter updates aren't atomic, the resetting is done twice
166 * to make sure that the counters are very likely to be all cleared.
167 */
168static ssize_t qstat_write(struct file *file, const char __user *user_buf,
169 size_t count, loff_t *ppos)
170{
171 int cpu;
172
173 /*
174 * Get the counter ID stored in file->f_inode->i_private
175 */
176 if (!file->f_inode) {
177 WARN_ON_ONCE(1);
178 return -EBADF;
179 }
180 if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
181 return count;
182
183 for_each_possible_cpu(cpu) {
184 int i;
185 unsigned long *ptr = per_cpu_ptr(qstats, cpu);
186
187 for (i = 0 ; i < qstat_num; i++)
188 WRITE_ONCE(ptr[i], 0);
189 for (i = 0 ; i < qstat_num; i++)
190 WRITE_ONCE(ptr[i], 0);
191 }
192 return count;
193}
194
195/*
196 * Debugfs data structures
197 */
198static const struct file_operations fops_qstat = {
199 .read = qstat_read,
200 .write = qstat_write,
201 .llseek = default_llseek,
202};
203
204/*
205 * Initialize debugfs for the qspinlock statistical counters
206 */
207static int __init init_qspinlock_stat(void)
208{
209 struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
210 int i;
211
212 if (!d_qstat) {
213 pr_warn("Could not create 'qlockstat' debugfs directory\n");
214 return 0;
215 }
216
217 /*
218 * Create the debugfs files
219 *
220 * As reading from and writing to the stat files can be slow, only
221 * root is allowed to do the read/write to limit impact to system
222 * performance.
223 */
224 for (i = 0; i < qstat_num; i++)
225 debugfs_create_file(qstat_names[i], 0400, d_qstat,
226 (void *)(long)i, &fops_qstat);
227
228 debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
229 (void *)(long)qstat_reset_cnts, &fops_qstat);
230 return 0;
231}
232fs_initcall(init_qspinlock_stat);
233
234/*
235 * Increment the PV qspinlock statistical counters
236 */
237static inline void qstat_inc(enum qlock_stats stat, bool cond)
238{
239 if (cond)
240 this_cpu_inc(qstats[stat]);
241}
242
243/*
244 * PV hash hop count
245 */
246static inline void qstat_hop(int hopcnt)
247{
248 this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
249}
250
251/*
252 * Replacement function for pv_kick()
253 */
254static inline void __pv_kick(int cpu)
255{
256 u64 start = sched_clock();
257
258 per_cpu(pv_kick_time, cpu) = start;
259 pv_kick(cpu);
260 this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
261}
262
263/*
264 * Replacement function for pv_wait()
265 */
266static inline void __pv_wait(u8 *ptr, u8 val)
267{
268 u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
269
270 *pkick_time = 0;
271 pv_wait(ptr, val);
272 if (*pkick_time) {
273 this_cpu_add(qstats[qstat_pv_latency_wake],
274 sched_clock() - *pkick_time);
275 qstat_inc(qstat_pv_kick_wake, true);
276 }
277}
278
279#define pv_kick(c) __pv_kick(c)
280#define pv_wait(p, v) __pv_wait(p, v)
281
282/*
283 * PV unfair trylock count tracking function
284 */
285static inline int qstat_spin_steal_lock(struct qspinlock *lock)
286{
287 int ret = pv_queued_spin_steal_lock(lock);
288
289 qstat_inc(qstat_pv_lock_stealing, ret);
290 return ret;
291}
292#undef queued_spin_trylock
293#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
294
295#else /* CONFIG_QUEUED_LOCK_STAT */
296
297static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
298static inline void qstat_hop(int hopcnt) { }
299
300#endif /* CONFIG_QUEUED_LOCK_STAT */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1ef0d7aeab47..34cb9f7fc2d2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1905,6 +1905,97 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1905 raw_spin_unlock(&rq->lock); 1905 raw_spin_unlock(&rq->lock);
1906} 1906}
1907 1907
1908/*
1909 * Notes on Program-Order guarantees on SMP systems.
1910 *
1911 * MIGRATION
1912 *
1913 * The basic program-order guarantee on SMP systems is that when a task [t]
1914 * migrates, all its activity on its old cpu [c0] happens-before any subsequent
1915 * execution on its new cpu [c1].
1916 *
1917 * For migration (of runnable tasks) this is provided by the following means:
1918 *
1919 * A) UNLOCK of the rq(c0)->lock scheduling out task t
1920 * B) migration for t is required to synchronize *both* rq(c0)->lock and
1921 * rq(c1)->lock (if not at the same time, then in that order).
1922 * C) LOCK of the rq(c1)->lock scheduling in task
1923 *
1924 * Transitivity guarantees that B happens after A and C after B.
1925 * Note: we only require RCpc transitivity.
1926 * Note: the cpu doing B need not be c0 or c1
1927 *
1928 * Example:
1929 *
1930 * CPU0 CPU1 CPU2
1931 *
1932 * LOCK rq(0)->lock
1933 * sched-out X
1934 * sched-in Y
1935 * UNLOCK rq(0)->lock
1936 *
1937 * LOCK rq(0)->lock // orders against CPU0
1938 * dequeue X
1939 * UNLOCK rq(0)->lock
1940 *
1941 * LOCK rq(1)->lock
1942 * enqueue X
1943 * UNLOCK rq(1)->lock
1944 *
1945 * LOCK rq(1)->lock // orders against CPU2
1946 * sched-out Z
1947 * sched-in X
1948 * UNLOCK rq(1)->lock
1949 *
1950 *
1951 * BLOCKING -- aka. SLEEP + WAKEUP
1952 *
1953 * For blocking we (obviously) need to provide the same guarantee as for
1954 * migration. However the means are completely different as there is no lock
1955 * chain to provide order. Instead we do:
1956 *
1957 * 1) smp_store_release(X->on_cpu, 0)
1958 * 2) smp_cond_acquire(!X->on_cpu)
1959 *
1960 * Example:
1961 *
1962 * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
1963 *
1964 * LOCK rq(0)->lock LOCK X->pi_lock
1965 * dequeue X
1966 * sched-out X
1967 * smp_store_release(X->on_cpu, 0);
1968 *
1969 * smp_cond_acquire(!X->on_cpu);
1970 * X->state = WAKING
1971 * set_task_cpu(X,2)
1972 *
1973 * LOCK rq(2)->lock
1974 * enqueue X
1975 * X->state = RUNNING
1976 * UNLOCK rq(2)->lock
1977 *
1978 * LOCK rq(2)->lock // orders against CPU1
1979 * sched-out Z
1980 * sched-in X
1981 * UNLOCK rq(2)->lock
1982 *
1983 * UNLOCK X->pi_lock
1984 * UNLOCK rq(0)->lock
1985 *
1986 *
 1987 * However, for wakeups there is a second guarantee we must provide, namely we
 1988 * must observe the state that led to our wakeup. That is, not only must our
1989 * task observe its own prior state, it must also observe the stores prior to
1990 * its wakeup.
1991 *
 1992 * This means that any mechanism for doing remote wakeups must order the CPU doing
1993 * the wakeup against the CPU the task is going to end up running on. This,
1994 * however, is already required for the regular Program-Order guarantee above,
 1995 * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire).
1996 *
1997 */
1998
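The smp_cond_acquire() primitive used above (and in the hunk below) is introduced elsewhere in this pull, in include/linux/compiler.h. Assuming it keeps the control-dependency-plus-rmb construction of the open-coded loop it replaces in try_to_wake_up(), its definition is roughly:

        #define smp_cond_acquire(cond)  do {                            \
                while (!(cond))                                         \
                        cpu_relax();                                    \
                smp_rmb(); /* ctrl + rmb := acquire */                  \
        } while (0)

The spin provides a control dependency from the load of cond to everything that follows; the trailing smp_rmb() upgrades that into the load-ACQUIRE the ordering argument above relies on.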
1908/** 1999/**
1909 * try_to_wake_up - wake up a thread 2000 * try_to_wake_up - wake up a thread
1910 * @p: the thread to be awakened 2001 * @p: the thread to be awakened
@@ -1968,19 +2059,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1968 /* 2059 /*
1969 * If the owning (remote) cpu is still in the middle of schedule() with 2060 * If the owning (remote) cpu is still in the middle of schedule() with
1970 * this task as prev, wait until its done referencing the task. 2061 * this task as prev, wait until its done referencing the task.
1971 */
1972 while (p->on_cpu)
1973 cpu_relax();
1974 /*
1975 * Combined with the control dependency above, we have an effective
1976 * smp_load_acquire() without the need for full barriers.
1977 * 2062 *
1978 * Pairs with the smp_store_release() in finish_lock_switch(). 2063 * Pairs with the smp_store_release() in finish_lock_switch().
1979 * 2064 *
1980 * This ensures that tasks getting woken will be fully ordered against 2065 * This ensures that tasks getting woken will be fully ordered against
1981 * their previous state and preserve Program Order. 2066 * their previous state and preserve Program Order.
1982 */ 2067 */
1983 smp_rmb(); 2068 smp_cond_acquire(!p->on_cpu);
1984 2069
1985 p->sched_contributes_to_load = !!task_contributes_to_load(p); 2070 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1986 p->state = TASK_WAKING; 2071 p->state = TASK_WAKING;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b242775bf670..1e0bb4afe3fd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1076,7 +1076,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1076 * In particular, the load of prev->state in finish_task_switch() must 1076 * In particular, the load of prev->state in finish_task_switch() must
1077 * happen before this. 1077 * happen before this.
1078 * 1078 *
1079 * Pairs with the control dependency and rmb in try_to_wake_up(). 1079 * Pairs with the smp_cond_acquire() in try_to_wake_up().
1080 */ 1080 */
1081 smp_store_release(&prev->on_cpu, 0); 1081 smp_store_release(&prev->on_cpu, 0);
1082#endif 1082#endif
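Taken together with the try_to_wake_up() hunk above, the two ends of the handshake pair as follows (a condensed sketch, not a verbatim copy of either function):

        /* finish_lock_switch(), on the CPU that prev is leaving:
         * the RELEASE orders all of prev's prior execution before the store. */
        smp_store_release(&prev->on_cpu, 0);

        /* try_to_wake_up(), on the waking CPU:
         * spin until that release is visible, then ACQUIRE, so everything
         * done afterwards (setting p->state, set_task_cpu(), the enqueue)
         * is ordered after p's execution on its old CPU. */
        smp_cond_acquire(!p->on_cpu);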
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c
index 83c33a5bcffb..18e422b259cf 100644
--- a/lib/atomic64_test.c
+++ b/lib/atomic64_test.c
@@ -27,6 +27,65 @@ do { \
27 (unsigned long long)r); \ 27 (unsigned long long)r); \
28} while (0) 28} while (0)
29 29
30/*
 31 * Test for an atomic operation family,
32 * @test should be a macro accepting parameters (bit, op, ...)
33 */
34
35#define FAMILY_TEST(test, bit, op, args...) \
36do { \
37 test(bit, op, ##args); \
38 test(bit, op##_acquire, ##args); \
39 test(bit, op##_release, ##args); \
40 test(bit, op##_relaxed, ##args); \
41} while (0)
42
43#define TEST_RETURN(bit, op, c_op, val) \
44do { \
45 atomic##bit##_set(&v, v0); \
46 r = v0; \
47 r c_op val; \
48 BUG_ON(atomic##bit##_##op(val, &v) != r); \
49 BUG_ON(atomic##bit##_read(&v) != r); \
50} while (0)
51
52#define RETURN_FAMILY_TEST(bit, op, c_op, val) \
53do { \
54 FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \
55} while (0)
56
57#define TEST_ARGS(bit, op, init, ret, expect, args...) \
58do { \
59 atomic##bit##_set(&v, init); \
60 BUG_ON(atomic##bit##_##op(&v, ##args) != ret); \
61 BUG_ON(atomic##bit##_read(&v) != expect); \
62} while (0)
63
64#define XCHG_FAMILY_TEST(bit, init, new) \
65do { \
66 FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new); \
67} while (0)
68
69#define CMPXCHG_FAMILY_TEST(bit, init, new, wrong) \
70do { \
71 FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \
72 init, init, new, init, new); \
73 FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \
74 init, init, init, wrong, new); \
75} while (0)
76
77#define INC_RETURN_FAMILY_TEST(bit, i) \
78do { \
79 FAMILY_TEST(TEST_ARGS, bit, inc_return, \
80 i, (i) + one, (i) + one); \
81} while (0)
82
83#define DEC_RETURN_FAMILY_TEST(bit, i) \
84do { \
85 FAMILY_TEST(TEST_ARGS, bit, dec_return, \
86 i, (i) - one, (i) - one); \
87} while (0)
88
30static __init void test_atomic(void) 89static __init void test_atomic(void)
31{ 90{
32 int v0 = 0xaaa31337; 91 int v0 = 0xaaa31337;
@@ -45,6 +104,18 @@ static __init void test_atomic(void)
45 TEST(, and, &=, v1); 104 TEST(, and, &=, v1);
46 TEST(, xor, ^=, v1); 105 TEST(, xor, ^=, v1);
47 TEST(, andnot, &= ~, v1); 106 TEST(, andnot, &= ~, v1);
107
108 RETURN_FAMILY_TEST(, add_return, +=, onestwos);
109 RETURN_FAMILY_TEST(, add_return, +=, -one);
110 RETURN_FAMILY_TEST(, sub_return, -=, onestwos);
111 RETURN_FAMILY_TEST(, sub_return, -=, -one);
112
113 INC_RETURN_FAMILY_TEST(, v0);
114 DEC_RETURN_FAMILY_TEST(, v0);
115
116 XCHG_FAMILY_TEST(, v0, v1);
117 CMPXCHG_FAMILY_TEST(, v0, v1, onestwos);
118
48} 119}
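To see what the family macros buy, it helps to expand one invocation by hand. RETURN_FAMILY_TEST(, add_return, +=, onestwos) from test_atomic() above stamps out the base operation plus its _acquire/_release/_relaxed variants, i.e. it is equivalent to (hand expansion, for illustration):

        TEST_RETURN(, add_return, +=, onestwos);
        TEST_RETURN(, add_return_acquire, +=, onestwos);
        TEST_RETURN(, add_return_release, +=, onestwos);
        TEST_RETURN(, add_return_relaxed, +=, onestwos);

        /* ...and each TEST_RETURN() checks both the returned and the stored
         * value; the first line above boils down to: */
        atomic_set(&v, v0);
        r = v0;
        r += onestwos;
        BUG_ON(atomic_add_return(onestwos, &v) != r);
        BUG_ON(atomic_read(&v) != r);

Likewise, CMPXCHG_FAMILY_TEST(, v0, v1, onestwos) covers both the successful exchange (expected value matches, the result becomes v1) and the failing one (wrong expected value, the result stays v0) for cmpxchg and its _acquire/_release/_relaxed variants.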
49 120
50#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0) 121#define INIT(c) do { atomic64_set(&v, c); r = c; } while (0)
@@ -74,25 +145,10 @@ static __init void test_atomic64(void)
74 TEST(64, xor, ^=, v1); 145 TEST(64, xor, ^=, v1);
75 TEST(64, andnot, &= ~, v1); 146 TEST(64, andnot, &= ~, v1);
76 147
77 INIT(v0); 148 RETURN_FAMILY_TEST(64, add_return, +=, onestwos);
78 r += onestwos; 149 RETURN_FAMILY_TEST(64, add_return, +=, -one);
79 BUG_ON(atomic64_add_return(onestwos, &v) != r); 150 RETURN_FAMILY_TEST(64, sub_return, -=, onestwos);
80 BUG_ON(v.counter != r); 151 RETURN_FAMILY_TEST(64, sub_return, -=, -one);
81
82 INIT(v0);
83 r += -one;
84 BUG_ON(atomic64_add_return(-one, &v) != r);
85 BUG_ON(v.counter != r);
86
87 INIT(v0);
88 r -= onestwos;
89 BUG_ON(atomic64_sub_return(onestwos, &v) != r);
90 BUG_ON(v.counter != r);
91
92 INIT(v0);
93 r -= -one;
94 BUG_ON(atomic64_sub_return(-one, &v) != r);
95 BUG_ON(v.counter != r);
96 152
97 INIT(v0); 153 INIT(v0);
98 atomic64_inc(&v); 154 atomic64_inc(&v);
@@ -100,33 +156,15 @@ static __init void test_atomic64(void)
100 BUG_ON(v.counter != r); 156 BUG_ON(v.counter != r);
101 157
102 INIT(v0); 158 INIT(v0);
103 r += one;
104 BUG_ON(atomic64_inc_return(&v) != r);
105 BUG_ON(v.counter != r);
106
107 INIT(v0);
108 atomic64_dec(&v); 159 atomic64_dec(&v);
109 r -= one; 160 r -= one;
110 BUG_ON(v.counter != r); 161 BUG_ON(v.counter != r);
111 162
112 INIT(v0); 163 INC_RETURN_FAMILY_TEST(64, v0);
113 r -= one; 164 DEC_RETURN_FAMILY_TEST(64, v0);
114 BUG_ON(atomic64_dec_return(&v) != r);
115 BUG_ON(v.counter != r);
116
117 INIT(v0);
118 BUG_ON(atomic64_xchg(&v, v1) != v0);
119 r = v1;
120 BUG_ON(v.counter != r);
121
122 INIT(v0);
123 BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0);
124 r = v1;
125 BUG_ON(v.counter != r);
126 165
127 INIT(v0); 166 XCHG_FAMILY_TEST(64, v0, v1);
128 BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0); 167 CMPXCHG_FAMILY_TEST(64, v0, v1, v2);
129 BUG_ON(v.counter != r);
130 168
131 INIT(v0); 169 INIT(v0);
132 BUG_ON(atomic64_add_unless(&v, one, v0)); 170 BUG_ON(atomic64_add_unless(&v, one, v0));