author		Linus Torvalds <torvalds@linux-foundation.org>	2016-01-11 17:18:38 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-11 17:18:38 -0500
commit		24af98c4cf5f5e69266e270c7f3fb34b82ff6656 (patch)
tree		70d71381c841c92b2d28397bf0c5d6a7d9bbbaac
parent		9061cbe62adeccf8c986883bcd40f4aeee59ea75 (diff)
parent		337f13046ff03717a9e99675284a817527440a49 (diff)
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
"So we have a laundry list of locking subsystem changes:
- continuing barrier API and code improvements
- futex enhancements
- atomics API improvements
- pvqspinlock enhancements: in particular lock stealing and adaptive
spinning
- qspinlock micro-enhancements"
* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op
futex: Cleanup the goto confusion in requeue_pi()
futex: Remove pointless put_pi_state calls in requeue()
futex: Document pi_state refcounting in requeue code
futex: Rename free_pi_state() to put_pi_state()
futex: Drop refcount if requeue_pi() acquired the rtmutex
locking/barriers, arch: Remove ambiguous statement in the smp_store_mb() documentation
locking/barriers, arch: Use smp barriers in smp_store_release()
locking/cmpxchg, arch: Remove tas() definitions
locking/pvqspinlock: Queue node adaptive spinning
locking/pvqspinlock: Allow limited lock stealing
locking/pvqspinlock: Collect slowpath lock statistics
sched/core, locking: Document Program-Order guarantees
locking, sched: Introduce smp_cond_acquire() and use it
locking/pvqspinlock, x86: Optimize the PV unlock code path
locking/qspinlock: Avoid redundant read of next pointer
locking/qspinlock: Prefetch the next node cacheline
locking/qspinlock: Use _acquire/_release() versions of cmpxchg() & xchg()
atomics: Add test for atomic operations with _relaxed variants
-rw-r--r--	Documentation/memory-barriers.txt		4
-rw-r--r--	arch/blackfin/include/asm/cmpxchg.h		1
-rw-r--r--	arch/c6x/include/asm/cmpxchg.h			2
-rw-r--r--	arch/frv/include/asm/cmpxchg.h			2
-rw-r--r--	arch/ia64/include/asm/barrier.h			2
-rw-r--r--	arch/powerpc/include/asm/barrier.h		2
-rw-r--r--	arch/s390/include/asm/barrier.h			2
-rw-r--r--	arch/tile/include/asm/cmpxchg.h			2
-rw-r--r--	arch/x86/Kconfig				8
-rw-r--r--	arch/x86/include/asm/qspinlock_paravirt.h	59
-rw-r--r--	include/asm-generic/barrier.h			2
-rw-r--r--	include/asm-generic/qspinlock.h			9
-rw-r--r--	include/linux/compiler.h			17
-rw-r--r--	kernel/futex.c					83
-rw-r--r--	kernel/locking/qspinlock.c			82
-rw-r--r--	kernel/locking/qspinlock_paravirt.h		252
-rw-r--r--	kernel/locking/qspinlock_stat.h			300
-rw-r--r--	kernel/sched/core.c				99
-rw-r--r--	kernel/sched/sched.h				2
-rw-r--r--	lib/atomic64_test.c				120
20 files changed, 904 insertions, 146 deletions
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 85304ebd187c..a61be39c7b51 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1673,8 +1673,8 @@ There are some more advanced barrier functions:
  (*) smp_store_mb(var, value)
 
      This assigns the value to the variable and then inserts a full memory
-     barrier after it, depending on the function. It isn't guaranteed to
-     insert anything more than a compiler barrier in a UP compilation.
+     barrier after it. It isn't guaranteed to insert anything more than a
+     compiler barrier in a UP compilation.
 
 
  (*) smp_mb__before_atomic();
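The generic fallback (updated the same way in the asm-generic/barrier.h hunk further down) is a plain store followed by smp_mb(). A minimal usage sketch of the sleep/wakeup idiom this primitive exists for; event_pending is a hypothetical flag, not something from this patch:

	/*
	 * Waiter: publish the state change, then test the condition.
	 * smp_store_mb() orders the store before the following load on
	 * SMP, and degrades to a pure compiler barrier on UP builds.
	 */
	smp_store_mb(current->state, TASK_UNINTERRUPTIBLE);
	if (!READ_ONCE(event_pending))		/* hypothetical flag */
		schedule();
	__set_current_state(TASK_RUNNING);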
diff --git a/arch/blackfin/include/asm/cmpxchg.h b/arch/blackfin/include/asm/cmpxchg.h
index c05868cc61c1..253928854299 100644
--- a/arch/blackfin/include/asm/cmpxchg.h
+++ b/arch/blackfin/include/asm/cmpxchg.h
@@ -128,6 +128,5 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
 #endif /* !CONFIG_SMP */
 
 #define xchg(ptr, x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
-#define tas(ptr) ((void)xchg((ptr), 1))
 
 #endif /* __ARCH_BLACKFIN_CMPXCHG__ */
diff --git a/arch/c6x/include/asm/cmpxchg.h b/arch/c6x/include/asm/cmpxchg.h
index b27c8cefb8c3..93d0a5a047a2 100644
--- a/arch/c6x/include/asm/cmpxchg.h
+++ b/arch/c6x/include/asm/cmpxchg.h
@@ -47,8 +47,6 @@ static inline unsigned int __xchg(unsigned int x, volatile void *ptr, int size)
 #define xchg(ptr, x) \
 	((__typeof__(*(ptr)))__xchg((unsigned int)(x), (void *) (ptr), \
 				    sizeof(*(ptr))))
-#define tas(ptr) xchg((ptr), 1)
-
 
 #include <asm-generic/cmpxchg-local.h>
 
diff --git a/arch/frv/include/asm/cmpxchg.h b/arch/frv/include/asm/cmpxchg.h
index 5b04dd0aecab..a899765102ea 100644
--- a/arch/frv/include/asm/cmpxchg.h
+++ b/arch/frv/include/asm/cmpxchg.h
@@ -69,8 +69,6 @@ extern uint32_t __xchg_32(uint32_t i, volatile void *v);
 
 #endif
 
-#define tas(ptr) (xchg((ptr), 1))
-
 /*****************************************************************************/
 /*
  * compare and conditionally exchange value with memory
diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h
index df896a1c41d3..209c4b817c95 100644
--- a/arch/ia64/include/asm/barrier.h
+++ b/arch/ia64/include/asm/barrier.h
@@ -77,7 +77,7 @@ do { \
 	___p1;								\
 })
 
-#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 /*
  * The group barrier in front of the rsm & ssm are necessary to ensure
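The mb() -> smp_mb() conversions in these barrier headers matter on !CONFIG_SMP builds, where smp_store_mb() must not emit a hardware barrier. A hedged sketch of the mapping the generic barrier header of this era uses:

	#ifdef CONFIG_SMP
	#define smp_mb()	mb()		/* real memory barrier */
	#else
	#define smp_mb()	barrier()	/* compiler barrier only on UP */
	#endif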
diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h
index 0eca6efc0631..a7af5fb7b914 100644
--- a/arch/powerpc/include/asm/barrier.h
+++ b/arch/powerpc/include/asm/barrier.h
@@ -34,7 +34,7 @@
 #define rmb()  __asm__ __volatile__ ("sync" : : : "memory")
 #define wmb()  __asm__ __volatile__ ("sync" : : : "memory")
 
-#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #ifdef __SUBARCH_HAS_LWSYNC
 #    define SMPWMB      LWSYNC
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index d68e11e0df5e..7ffd0b19135c 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -36,7 +36,7 @@
 #define smp_mb__before_atomic()	smp_mb()
 #define smp_mb__after_atomic()	smp_mb()
 
-#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value)	do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 
 #define smp_store_release(p, v)						\
 do {									\
diff --git a/arch/tile/include/asm/cmpxchg.h b/arch/tile/include/asm/cmpxchg.h
index 0ccda3c425be..25d5899497be 100644
--- a/arch/tile/include/asm/cmpxchg.h
+++ b/arch/tile/include/asm/cmpxchg.h
@@ -127,8 +127,6 @@ long long _atomic64_cmpxchg(long long *v, long long o, long long n);
 
 #endif
 
-#define tas(ptr) xchg((ptr), 1)
-
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_TILE_CMPXCHG_H */
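The tas() macros removed across these architectures were unused test-and-set shorthands; should a caller ever need the operation, the open-coded equivalent is simply:

	old = xchg(ptr, 1);	/* test-and-set: store 1, return the old value */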
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index db3622f22b61..965fc4216f76 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -687,6 +687,14 @@ config PARAVIRT_SPINLOCKS
 
 	  If you are unsure how to answer this question, answer Y.
 
+config QUEUED_LOCK_STAT
+	bool "Paravirt queued spinlock statistics"
+	depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+	---help---
+	  Enable the collection of statistical data on the slowpath
+	  behavior of paravirtualized queued spinlocks and report
+	  them on debugfs.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_GUEST
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
index b002e711ba88..9f92c180ed2f 100644
--- a/arch/x86/include/asm/qspinlock_paravirt.h
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -1,6 +1,65 @@
 #ifndef __ASM_QSPINLOCK_PARAVIRT_H
 #define __ASM_QSPINLOCK_PARAVIRT_H
 
+/*
+ * For x86-64, PV_CALLEE_SAVE_REGS_THUNK() saves and restores 8 64-bit
+ * registers. For i386, however, only 1 32-bit register needs to be saved
+ * and restored. So an optimized version of __pv_queued_spin_unlock() is
+ * hand-coded for 64-bit, but it isn't worthwhile to do it for 32-bit.
+ */
+#ifdef CONFIG_64BIT
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
+#define __pv_queued_spin_unlock	__pv_queued_spin_unlock
+#define PV_UNLOCK		"__raw_callee_save___pv_queued_spin_unlock"
+#define PV_UNLOCK_SLOWPATH	"__raw_callee_save___pv_queued_spin_unlock_slowpath"
+
+/*
+ * Optimized assembly version of __raw_callee_save___pv_queued_spin_unlock
+ * which combines the register-saving thunk and the body of the following
+ * C code:
+ *
+ * void __pv_queued_spin_unlock(struct qspinlock *lock)
+ * {
+ *	struct __qspinlock *l = (void *)lock;
+ *	u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ *
+ *	if (likely(lockval == _Q_LOCKED_VAL))
+ *		return;
+ *	pv_queued_spin_unlock_slowpath(lock, lockval);
+ * }
+ *
+ * For x86-64,
+ *   rdi = lock              (first argument)
+ *   rsi = lockval           (second argument)
+ *   rdx = internal variable (set to 0)
+ */
+asm    (".pushsection .text;"
+	".globl " PV_UNLOCK ";"
+	".align 4,0x90;"
+	PV_UNLOCK ": "
+	"push  %rdx;"
+	"mov   $0x1,%eax;"
+	"xor   %edx,%edx;"
+	"lock cmpxchg %dl,(%rdi);"
+	"cmp   $0x1,%al;"
+	"jne   .slowpath;"
+	"pop   %rdx;"
+	"ret;"
+	".slowpath: "
+	"push   %rsi;"
+	"movzbl %al,%esi;"
+	"call " PV_UNLOCK_SLOWPATH ";"
+	"pop    %rsi;"
+	"pop    %rdx;"
+	"ret;"
+	".size " PV_UNLOCK ", .-" PV_UNLOCK ";"
+	".popsection");
+
+#else /* CONFIG_64BIT */
+
+extern void __pv_queued_spin_unlock(struct qspinlock *lock);
 PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
 
+#endif /* CONFIG_64BIT */
 #endif
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index b42afada1280..0f45f93ef692 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -93,7 +93,7 @@
 #endif	/* CONFIG_SMP */
 
 #ifndef smp_store_mb
-#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); mb(); } while (0)
+#define smp_store_mb(var, value)  do { WRITE_ONCE(var, value); smp_mb(); } while (0)
 #endif
 
 #ifndef smp_mb__before_atomic
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index e2aadbc7151f..39e1cb201b8e 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -12,8 +12,9 @@
  * GNU General Public License for more details.
  *
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
  */
 #ifndef __ASM_GENERIC_QSPINLOCK_H
 #define __ASM_GENERIC_QSPINLOCK_H
@@ -62,7 +63,7 @@ static __always_inline int queued_spin_is_contended(struct qspinlock *lock)
 static __always_inline int queued_spin_trylock(struct qspinlock *lock)
 {
 	if (!atomic_read(&lock->val) &&
-	    (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) == 0))
+	    (atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL) == 0))
 		return 1;
 	return 0;
 }
@@ -77,7 +78,7 @@ static __always_inline void queued_spin_lock(struct qspinlock *lock)
 {
 	u32 val;
 
-	val = atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL);
+	val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);
 	if (likely(val == 0))
 		return;
 	queued_spin_lock_slowpath(lock, val);
@@ -93,7 +94,7 @@ static __always_inline void queued_spin_unlock(struct qspinlock *lock)
 	/*
 	 * smp_mb__before_atomic() in order to guarantee release semantics
 	 */
-	smp_mb__before_atomic_dec();
+	smp_mb__before_atomic();
 	atomic_sub(_Q_LOCKED_VAL, &lock->val);
 }
 #endif
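The _acquire conversions rely on the standard lock-ordering contract: the acquire on the locking cmpxchg keeps the critical section from floating up above the acquisition, and the release on unlock keeps it from floating down below. A hedged sketch of the pairing as these hunks leave it:

	val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);	/* lock */
	/* ... critical section: cannot be reordered before the acquire ... */
	smp_mb__before_atomic();			/* release ordering */
	atomic_sub(_Q_LOCKED_VAL, &lock->val);		/* unlock */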
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 4dac1036594f..00b042c49ccd 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -299,6 +299,23 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int size)
 	__u.__val;					\
 })
 
+/**
+ * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * @cond: boolean expression to wait for
+ *
+ * Equivalent to using smp_load_acquire() on the condition variable but employs
+ * the control dependency of the wait to reduce the barrier on many platforms.
+ *
+ * The control dependency provides a LOAD->STORE order, the additional RMB
+ * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
+ * aka. ACQUIRE.
+ */
+#define smp_cond_acquire(cond)	do {		\
+	while (!(cond))				\
+		cpu_relax();			\
+	smp_rmb(); /* ctrl + rmb := acquire */	\
+} while (0)
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
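A minimal producer/consumer sketch of the new macro; data and ready are hypothetical variables, not part of the patch. After smp_cond_acquire() returns, subsequent accesses are ordered after the read of the condition, just as with smp_load_acquire():

	/* producer */
	WRITE_ONCE(data, 1);
	smp_store_release(&ready, 1);

	/* consumer */
	smp_cond_acquire(READ_ONCE(ready));	/* spin, then ctrl + rmb := acquire */
	BUG_ON(READ_ONCE(data) != 1);		/* guaranteed to see the store */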
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d7549825a..8a310e240cda 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -725,9 +725,12 @@ static struct futex_pi_state * alloc_pi_state(void)
 }
 
 /*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
  * Must be called with the hb lock held.
  */
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
 {
 	if (!pi_state)
 		return;
@@ -1706,31 +1709,35 @@ retry_private:
 	 * exist yet, look it up one more time to ensure we have a
 	 * reference to it. If the lock was taken, ret contains the
 	 * vpid of the top waiter task.
+	 * If the lock was not taken, we have pi_state and an initial
+	 * refcount on it. In case of an error we have nothing.
 	 */
 	if (ret > 0) {
 		WARN_ON(pi_state);
 		drop_count++;
 		task_count++;
 		/*
-		 * If we acquired the lock, then the user
-		 * space value of uaddr2 should be vpid. It
-		 * cannot be changed by the top waiter as it
-		 * is blocked on hb2 lock if it tries to do
-		 * so. If something fiddled with it behind our
-		 * back the pi state lookup might unearth
-		 * it. So we rather use the known value than
-		 * rereading and handing potential crap to
-		 * lookup_pi_state.
+		 * If we acquired the lock, then the user space value
+		 * of uaddr2 should be vpid. It cannot be changed by
+		 * the top waiter as it is blocked on hb2 lock if it
+		 * tries to do so. If something fiddled with it behind
+		 * our back the pi state lookup might unearth it. So
+		 * we rather use the known value than rereading and
+		 * handing potential crap to lookup_pi_state.
+		 *
+		 * If that call succeeds then we have pi_state and an
+		 * initial refcount on it.
 		 */
 		ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
 	}
 
 	switch (ret) {
 	case 0:
+		/* We hold a reference on the pi state. */
 		break;
+
+		/* If the above failed, then pi_state is NULL */
 	case -EFAULT:
-		free_pi_state(pi_state);
-		pi_state = NULL;
 		double_unlock_hb(hb1, hb2);
 		hb_waiters_dec(hb2);
 		put_futex_key(&key2);
@@ -1746,8 +1753,6 @@ retry_private:
 	 *   exit to complete.
 	 * - The user space value changed.
 	 */
-	free_pi_state(pi_state);
-	pi_state = NULL;
 	double_unlock_hb(hb1, hb2);
 	hb_waiters_dec(hb2);
 	put_futex_key(&key2);
@@ -1801,30 +1806,58 @@ retry_private:
 		 * of requeue_pi if we couldn't acquire the lock atomically.
 		 */
 		if (requeue_pi) {
-			/* Prepare the waiter to take the rt_mutex. */
+			/*
+			 * Prepare the waiter to take the rt_mutex. Take a
+			 * refcount on the pi_state and store the pointer in
+			 * the futex_q object of the waiter.
+			 */
 			atomic_inc(&pi_state->refcount);
 			this->pi_state = pi_state;
 			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
 							this->rt_waiter,
 							this->task);
 			if (ret == 1) {
-				/* We got the lock. */
+				/*
+				 * We got the lock. We do neither drop the
+				 * refcount on pi_state nor clear
+				 * this->pi_state because the waiter needs the
+				 * pi_state for cleaning up the user space
+				 * value. It will drop the refcount after
+				 * doing so.
+				 */
 				requeue_pi_wake_futex(this, &key2, hb2);
 				drop_count++;
 				continue;
 			} else if (ret) {
-				/* -EDEADLK */
+				/*
+				 * rt_mutex_start_proxy_lock() detected a
+				 * potential deadlock when we tried to queue
+				 * that waiter. Drop the pi_state reference
+				 * which we took above and remove the pointer
+				 * to the state from the waiters futex_q
+				 * object.
+				 */
 				this->pi_state = NULL;
-				free_pi_state(pi_state);
-				goto out_unlock;
+				put_pi_state(pi_state);
+				/*
+				 * We stop queueing more waiters and let user
+				 * space deal with the mess.
+				 */
+				break;
 			}
 		}
 		requeue_futex(this, hb1, hb2, &key2);
 		drop_count++;
 	}
 
+	/*
+	 * We took an extra initial reference to the pi_state either
+	 * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
+	 * need to drop it here again.
+	 */
+	put_pi_state(pi_state);
+
 out_unlock:
-	free_pi_state(pi_state);
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
@@ -1973,7 +2006,7 @@ static void unqueue_me_pi(struct futex_q *q)
 	__unqueue_futex(q);
 
 	BUG_ON(!q->pi_state);
-	free_pi_state(q->pi_state);
+	put_pi_state(q->pi_state);
 	q->pi_state = NULL;
 
 	spin_unlock(q->lock_ptr);
@@ -2755,6 +2788,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		if (q.pi_state && (q.pi_state->owner != current)) {
 			spin_lock(q.lock_ptr);
 			ret = fixup_pi_state_owner(uaddr2, &q, current);
+			/*
+			 * Drop the reference to the pi state which
+			 * the requeue_pi() code acquired for us.
+			 */
+			put_pi_state(q.pi_state);
 			spin_unlock(q.lock_ptr);
 		}
 	} else {
@@ -3046,7 +3084,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 
 	if (op & FUTEX_CLOCK_REALTIME) {
 		flags |= FLAGS_CLOCKRT;
-		if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+		if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \
+		    cmd != FUTEX_WAIT_REQUEUE_PI)
 			return -ENOSYS;
 	}
 
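The last hunk is the user-visible part of "futex: Allow FUTEX_CLOCK_REALTIME with FUTEX_WAIT op": the flag combination no longer returns -ENOSYS. A hedged userspace sketch, with error handling elided and uaddr/val assumed to be set up by the caller; note that FUTEX_WAIT still takes a relative timeout, now measured against CLOCK_REALTIME:

	#include <linux/futex.h>
	#include <sys/syscall.h>
	#include <time.h>
	#include <unistd.h>

	struct timespec rel = { .tv_sec = 1, .tv_nsec = 0 };

	/* Sleep until *uaddr != val, for at most 1s of CLOCK_REALTIME. */
	syscall(SYS_futex, uaddr, FUTEX_WAIT | FUTEX_CLOCK_REALTIME,
		val, &rel, NULL, 0);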
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..393d1874b9e0 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2013-2014 Red Hat, Inc.
  * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
 	struct __qspinlock *l = (void *)lock;
 
-	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+	/*
+	 * Use release semantics to make sure that the MCS node is properly
+	 * initialized before changing the tail code.
+	 */
+	return (u32)xchg_release(&l->tail,
+				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 
 #else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 
 	for (;;) {
 		new = (val & _Q_LOCKED_PENDING_MASK) | tail;
-		old = atomic_cmpxchg(&lock->val, val, new);
+		/*
+		 * Use release semantics to make sure that the MCS node is
+		 * properly initialized before changing the tail code.
+		 */
+		old = atomic_cmpxchg_release(&lock->val, val, new);
 		if (old == val)
 			break;
 
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+					   struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-					   struct mcs_spinlock *node) { }
+static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
+						   struct mcs_spinlock *node)
+						   { return 0; }
 
 #define pv_enabled()		false
 
 #define pv_init_node		__pv_init_node
 #define pv_wait_node		__pv_wait_node
 #define pv_kick_node		__pv_kick_node
-#define pv_wait_head		__pv_wait_head
+#define pv_wait_head_or_lock	__pv_wait_head_or_lock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 		if (val == new)
 			new |= _Q_PENDING_VAL;
 
-		old = atomic_cmpxchg(&lock->val, val, new);
+		/*
+		 * Acquire semantic is required here as the function may
+		 * return immediately if the lock was free.
+		 */
+		old = atomic_cmpxchg_acquire(&lock->val, val, new);
 		if (old == val)
 			break;
 
@@ -382,6 +398,7 @@ queue:
 	 * p,*,* -> n,*,*
 	 */
 	old = xchg_tail(lock, tail);
+	next = NULL;
 
 	/*
 	 * if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
 		prev = decode_tail(old);
 		WRITE_ONCE(prev->next, node);
 
-		pv_wait_node(node);
+		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);
+
+		/*
+		 * While waiting for the MCS lock, the next pointer may have
+		 * been set by another lock waiter. We optimistically load
+		 * the next pointer & prefetch the cacheline for writing
+		 * to reduce latency in the upcoming MCS unlock operation.
+		 */
+		next = READ_ONCE(node->next);
+		if (next)
+			prefetchw(next);
 	}
 
 	/*
@@ -406,11 +433,22 @@ queue:
 	 * sequentiality; this is because the set_locked() function below
 	 * does not imply a full barrier.
 	 *
+	 * The PV pv_wait_head_or_lock function, if active, will acquire
+	 * the lock and return a non-zero value. So we have to skip the
+	 * smp_load_acquire() call. As the next PV queue head hasn't been
+	 * designated yet, there is no way for the locked value to become
+	 * _Q_SLOW_VAL. So both the set_locked() and the
+	 * atomic_cmpxchg_relaxed() calls will be safe.
+	 *
+	 * If PV isn't active, 0 will be returned instead.
+	 *
 	 */
-	pv_wait_head(lock, node);
-	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
-		cpu_relax();
+	if ((val = pv_wait_head_or_lock(lock, node)))
+		goto locked;
 
+	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
 	/*
 	 * claim the lock:
 	 *
@@ -422,11 +460,17 @@ queue:
 	 * to grab the lock.
 	 */
 	for (;;) {
-		if (val != tail) {
+		/* In the PV case we might already have _Q_LOCKED_VAL set */
+		if ((val & _Q_TAIL_MASK) != tail) {
 			set_locked(lock);
 			break;
 		}
-		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+		/*
+		 * The smp_load_acquire() call above has provided the necessary
+		 * acquire semantics required for locking. At most two
+		 * iterations of this loop may run.
+		 */
+		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
 		if (old == val)
 			goto release;	/* No contention */
 
@@ -434,10 +478,12 @@ queue:
 	}
 
 	/*
-	 * contended path; wait for next, release.
+	 * contended path; wait for next if not observed yet, release.
 	 */
-	while (!(next = READ_ONCE(node->next)))
-		cpu_relax();
+	if (!next) {
+		while (!(next = READ_ONCE(node->next)))
+			cpu_relax();
+	}
 
 	arch_mcs_spin_unlock_contended(&next->locked);
 	pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_init_node
 #undef pv_wait_node
 #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
 
 #undef queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff4829b..87bb235c3448 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
 /*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK	0xff
+
+/*
  * Queue node uses: vcpu_running & vcpu_halted.
  * Queue head uses: vcpu_running & vcpu_hashed.
  */
@@ -41,6 +55,94 @@ struct pv_node {
 };
 
 /*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enters the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+		(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return !READ_ONCE(l->locked) &&
+	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+			== _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+	atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+	int val = atomic_read(&lock->val);
+
+	for (;;) {
+		int old, new;
+
+		if (val & _Q_LOCKED_MASK)
+			break;
+
+		/*
+		 * Try to clear pending bit & set locked bit
+		 */
+		old = val;
+		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+		val = atomic_cmpxchg(&lock->val, old, new);
+
+		if (val == old)
+			return 1;
+	}
+	return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
  * Lock and MCS node addresses hash table for fast lookup
  *
  * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
 	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
 	struct pv_hash_entry *he;
+	int hopcnt = 0;
 
 	for_each_hash_entry(he, offset, hash) {
+		hopcnt++;
 		if (!cmpxchg(&he->lock, NULL, lock)) {
 			WRITE_ONCE(he->node, node);
+			qstat_hop(hopcnt);
 			return &he->lock;
 		}
 	}
@@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 }
 
 /*
+ * Return true when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+	if ((loop & PV_PREV_CHECK_MASK) != 0)
+		return false;
+
+	return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
  * Initialize the PV part of the mcs_spinlock node.
  */
 static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
  * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
  * behalf.
  */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct pv_node *pp = (struct pv_node *)prev;
+	int waitcnt = 0;
 	int loop;
+	bool wait_early;
 
-	for (;;) {
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
+	/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+	for (;; waitcnt++) {
+		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
 			if (READ_ONCE(node->locked))
 				return;
+			if (pv_wait_early(pp, loop)) {
+				wait_early = true;
+				break;
+			}
 			cpu_relax();
 		}
 
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 */
 		smp_store_mb(pn->state, vcpu_halted);
 
-		if (!READ_ONCE(node->locked))
+		if (!READ_ONCE(node->locked)) {
+			qstat_inc(qstat_pv_wait_node, true);
+			qstat_inc(qstat_pv_wait_again, waitcnt);
+			qstat_inc(qstat_pv_wait_early, wait_early);
 			pv_wait(&pn->state, vcpu_halted);
+		}
 
 		/*
-		 * If pv_kick_node() changed us to vcpu_hashed, retain that value
-		 * so that pv_wait_head() knows to not also try to hash this lock.
+		 * If pv_kick_node() changed us to vcpu_hashed, retain that
+		 * value so that pv_wait_head_or_lock() knows to not also try
+		 * to hash this lock.
 		 */
 		cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		 * So it is better to spin for a while in the hope that the
 		 * MCS lock will be released soon.
 		 */
+		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
 	}
 
 	/*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 /*
  * Called after setting next->locked = 1 when we're the lock owner.
  *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
  */
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 }
 
 /*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
  * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
  */
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
 	struct __qspinlock *l = (void *)lock;
 	struct qspinlock **lp = NULL;
+	int waitcnt = 0;
 	int loop;
 
 	/*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	if (READ_ONCE(pn->state) == vcpu_hashed)
 		lp = (struct qspinlock **)1;
 
-	for (;;) {
+	for (;; waitcnt++) {
+		/*
+		 * Set correct vCPU state to be used by queue node wait-early
+		 * mechanism.
+		 */
+		WRITE_ONCE(pn->state, vcpu_running);
+
+		/*
+		 * Set the pending bit in the active lock spinning loop to
+		 * disable lock stealing before attempting to acquire the lock.
+		 */
+		set_pending(lock);
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
-			if (!READ_ONCE(l->locked))
-				return;
+			if (trylock_clear_pending(lock))
+				goto gotlock;
 			cpu_relax();
 		}
+		clear_pending(lock);
+
 
 		if (!lp) { /* ONCE */
 			lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			 *
 			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
-			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+			if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
 				/*
-				 * The lock is free and _Q_SLOW_VAL has never
-				 * been set. Therefore we need to unhash before
-				 * getting the lock.
+				 * The lock was free and now we own the lock.
+				 * Change the lock value back to _Q_LOCKED_VAL
+				 * and unhash the table.
 				 */
+				WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
 				WRITE_ONCE(*lp, NULL);
-				return;
+				goto gotlock;
 			}
 		}
+		WRITE_ONCE(pn->state, vcpu_halted);
+		qstat_inc(qstat_pv_wait_head, true);
+		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
 
 		/*
 		 * The unlocker should have freed the lock before kicking the
 		 * CPU. So if the lock is still not free, it is a spurious
-		 * wakeup and so the vCPU should wait again after spinning for
-		 * a while.
+		 * wakeup or another vCPU has stolen the lock. The current
+		 * vCPU should spin again.
 		 */
+		qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
 	}
 
 	/*
-	 * Lock is unlocked now; the caller will acquire it without waiting.
-	 * As with pv_wait_node() we rely on the caller to do a load-acquire
-	 * for us.
+	 * The cmpxchg() or xchg() call before coming here provides the
+	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+	 * here is to indicate to the compiler that the value will always
+	 * be nonzero to enable better code optimization.
 	 */
+gotlock:
+	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
 }
 
 /*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
 */
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 {
 	struct __qspinlock *l = (void *)lock;
 	struct pv_node *node;
-	u8 locked;
-
-	/*
-	 * We must not unlock if SLOW, because in that case we must first
-	 * unhash. Otherwise it would be possible to have multiple @lock
-	 * entries, which would be BAD.
-	 */
-	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
-	if (likely(locked == _Q_LOCKED_VAL))
-		return;
 
 	if (unlikely(locked != _Q_SLOW_VAL)) {
 		WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * so we need a barrier to order the read of the node data in
 	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 	 *
-	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
 	 */
 	smp_rmb();
 
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
 	 * vCPU is harmless other than the additional latency in completing
 	 * the unlock.
 	 */
+	qstat_inc(qstat_pv_kick_unlock, true);
 	pv_kick(node->cpu);
 }
+
 /*
  * Include the architecture specific callee-save thunk of the
- * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock(). This thunk is put together with
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
 */
 #include <asm/qspinlock_paravirt.h>
 
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+	u8 locked;
+
+	/*
+	 * We must not unlock if SLOW, because in that case we must first
+	 * unhash. Otherwise it would be possible to have multiple @lock
+	 * entries, which would be BAD.
+	 */
+	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+	if (likely(locked == _Q_LOCKED_VAL))
+		return;
+
+	__pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
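The #ifndef guard at the end of this file is the hook that lets an architecture provide its own unlock fastpath, which is exactly what the x86-64 header earlier in this merge does. A condensed sketch of the override pattern, assuming the same macro names:

	/* arch/<arch>/include/asm/qspinlock_paravirt.h */
	PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock_slowpath);
	#define __pv_queued_spin_unlock	__pv_queued_spin_unlock
	/* ... hand-coded __raw_callee_save___pv_queued_spin_unlock ... */

Defining the __pv_queued_spin_unlock macro suppresses the generic C fastpath when <asm/qspinlock_paravirt.h> is included, so only the slowpath is shared.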
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h new file mode 100644 index 000000000000..640dcecdd1df --- /dev/null +++ b/kernel/locking/qspinlock_stat.h | |||
@@ -0,0 +1,300 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License as published by | ||
4 | * the Free Software Foundation; either version 2 of the License, or | ||
5 | * (at your option) any later version. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, | ||
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
10 | * GNU General Public License for more details. | ||
11 | * | ||
12 | * Authors: Waiman Long <waiman.long@hpe.com> | ||
13 | */ | ||
14 | |||
15 | /* | ||
16 | * When queued spinlock statistical counters are enabled, the following | ||
17 | * debugfs files will be created for reporting the counter values: | ||
18 | * | ||
19 | * <debugfs>/qlockstat/ | ||
20 | * pv_hash_hops - average # of hops per hashing operation | ||
21 | * pv_kick_unlock - # of vCPU kicks issued at unlock time | ||
22 | * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake | ||
23 | * pv_latency_kick - average latency (ns) of vCPU kick operation | ||
24 | * pv_latency_wake - average latency (ns) from vCPU kick to wakeup | ||
25 | * pv_lock_stealing - # of lock stealing operations | ||
26 | * pv_spurious_wakeup - # of spurious wakeups | ||
27 | * pv_wait_again - # of vCPU wait's that happened after a vCPU kick | ||
28 | * pv_wait_early - # of early vCPU wait's | ||
29 | * pv_wait_head - # of vCPU wait's at the queue head | ||
30 | * pv_wait_node - # of vCPU wait's at a non-head queue node | ||
31 | * | ||
32 | * Writing to the "reset_counters" file will reset all the above counter | ||
33 | * values. | ||
34 | * | ||
35 | * These statistical counters are implemented as per-cpu variables which are | ||
36 | * summed and computed whenever the corresponding debugfs files are read. This | ||
37 | * minimizes added overhead making the counters usable even in a production | ||
38 | * environment. | ||
39 | * | ||
40 | * There may be slight difference between pv_kick_wake and pv_kick_unlock. | ||
41 | */ | ||
42 | enum qlock_stats { | ||
43 | qstat_pv_hash_hops, | ||
44 | qstat_pv_kick_unlock, | ||
45 | qstat_pv_kick_wake, | ||
46 | qstat_pv_latency_kick, | ||
47 | qstat_pv_latency_wake, | ||
48 | qstat_pv_lock_stealing, | ||
49 | qstat_pv_spurious_wakeup, | ||
50 | qstat_pv_wait_again, | ||
51 | qstat_pv_wait_early, | ||
52 | qstat_pv_wait_head, | ||
53 | qstat_pv_wait_node, | ||
54 | qstat_num, /* Total number of statistical counters */ | ||
55 | qstat_reset_cnts = qstat_num, | ||
56 | }; | ||
57 | |||
58 | #ifdef CONFIG_QUEUED_LOCK_STAT | ||
59 | /* | ||
60 | * Collect pvqspinlock statistics | ||
61 | */ | ||
62 | #include <linux/debugfs.h> | ||
63 | #include <linux/sched.h> | ||
64 | #include <linux/fs.h> | ||
65 | |||
66 | static const char * const qstat_names[qstat_num + 1] = { | ||
67 | [qstat_pv_hash_hops] = "pv_hash_hops", | ||
68 | [qstat_pv_kick_unlock] = "pv_kick_unlock", | ||
69 | [qstat_pv_kick_wake] = "pv_kick_wake", | ||
70 | [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", | ||
71 | [qstat_pv_latency_kick] = "pv_latency_kick", | ||
72 | [qstat_pv_latency_wake] = "pv_latency_wake", | ||
73 | [qstat_pv_lock_stealing] = "pv_lock_stealing", | ||
74 | [qstat_pv_wait_again] = "pv_wait_again", | ||
75 | [qstat_pv_wait_early] = "pv_wait_early", | ||
76 | [qstat_pv_wait_head] = "pv_wait_head", | ||
77 | [qstat_pv_wait_node] = "pv_wait_node", | ||
78 | [qstat_reset_cnts] = "reset_counters", | ||
79 | }; | ||
80 | |||
81 | /* | ||
82 | * Per-cpu counters | ||
83 | */ | ||
84 | static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); | ||
85 | static DEFINE_PER_CPU(u64, pv_kick_time); | ||
86 | |||
87 | /* | ||
88 | * Function to read and return the qlock statistical counter values | ||
89 | * | ||
90 | * The following counters are handled specially: | ||
91 | * 1. qstat_pv_latency_kick | ||
92 | * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock | ||
93 | * 2. qstat_pv_latency_wake | ||
94 | * Average wake latency (ns) = pv_latency_wake/pv_kick_wake | ||
95 | * 3. qstat_pv_hash_hops | ||
96 | * Average hops/hash = pv_hash_hops/pv_kick_unlock | ||
97 | */ | ||
98 | static ssize_t qstat_read(struct file *file, char __user *user_buf, | ||
99 | size_t count, loff_t *ppos) | ||
100 | { | ||
101 | char buf[64]; | ||
102 | int cpu, counter, len; | ||
103 | u64 stat = 0, kicks = 0; | ||
104 | |||
105 | /* | ||
106 | * Get the counter ID stored in file->f_inode->i_private | ||
107 | */ | ||
108 | if (!file->f_inode) { | ||
109 | WARN_ON_ONCE(1); | ||
110 | return -EBADF; | ||
111 | } | ||
112 | counter = (long)(file->f_inode->i_private); | ||
113 | |||
114 | if (counter >= qstat_num) | ||
115 | return -EBADF; | ||
116 | |||
117 | for_each_possible_cpu(cpu) { | ||
118 | stat += per_cpu(qstats[counter], cpu); | ||
119 | /* | ||
120 | * Some counters also need the matching kick count summed for averaging | ||
121 | */ | ||
122 | switch (counter) { | ||
123 | |||
124 | case qstat_pv_latency_kick: | ||
125 | case qstat_pv_hash_hops: | ||
126 | kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); | ||
127 | break; | ||
128 | |||
129 | case qstat_pv_latency_wake: | ||
130 | kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); | ||
131 | break; | ||
132 | } | ||
133 | } | ||
134 | |||
135 | if (counter == qstat_pv_hash_hops) { | ||
136 | u64 frac = 0; | ||
137 | |||
138 | if (kicks) { | ||
139 | frac = 100ULL * do_div(stat, kicks); | ||
140 | frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); | ||
141 | } | ||
142 | |||
143 | /* Return an X.XX decimal number */ | ||
144 | len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); | ||
145 | } else { | ||
146 | /* | ||
147 | * Divide the summed latency by the matching number of kicks and | ||
148 | * round to the nearest ns to report the average latency. | ||
149 | */ | ||
150 | if ((counter == qstat_pv_latency_kick) || | ||
151 | (counter == qstat_pv_latency_wake)) { | ||
152 | if (kicks) | ||
153 | stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); | ||
154 | } | ||
155 | len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); | ||
156 | } | ||
157 | |||
158 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); | ||
159 | } | ||
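A worked example of the decimal formatting above, using assumed numbers: if the summed pv_hash_hops is 250 and the summed pv_kick_unlock is 100, then do_div(stat, kicks) divides stat in place (stat becomes 2) and returns the remainder 50, so frac = DIV_ROUND_CLOSEST_ULL(100 * 50, 100) = 50 and the file reports an average of "2.50" hops per hashing operation.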
160 | |||
161 | /* | ||
162 | * Function to handle write request | ||
163 | * | ||
164 | * When counter = reset_cnts, reset all the counter values. | ||
165 | * Since the counter updates aren't atomic, the resetting is done twice; | ||
166 | * an update racing with the first pass is very likely caught by the second. | ||
167 | */ | ||
168 | static ssize_t qstat_write(struct file *file, const char __user *user_buf, | ||
169 | size_t count, loff_t *ppos) | ||
170 | { | ||
171 | int cpu; | ||
172 | |||
173 | /* | ||
174 | * Get the counter ID stored in file->f_inode->i_private | ||
175 | */ | ||
176 | if (!file->f_inode) { | ||
177 | WARN_ON_ONCE(1); | ||
178 | return -EBADF; | ||
179 | } | ||
180 | if ((long)(file->f_inode->i_private) != qstat_reset_cnts) | ||
181 | return count; | ||
182 | |||
183 | for_each_possible_cpu(cpu) { | ||
184 | int i; | ||
185 | unsigned long *ptr = per_cpu_ptr(qstats, cpu); | ||
186 | |||
187 | for (i = 0; i < qstat_num; i++) | ||
188 | WRITE_ONCE(ptr[i], 0); | ||
189 | for (i = 0; i < qstat_num; i++) | ||
190 | WRITE_ONCE(ptr[i], 0); | ||
191 | } | ||
192 | return count; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * Debugfs data structures | ||
197 | */ | ||
198 | static const struct file_operations fops_qstat = { | ||
199 | .read = qstat_read, | ||
200 | .write = qstat_write, | ||
201 | .llseek = default_llseek, | ||
202 | }; | ||
203 | |||
204 | /* | ||
205 | * Initialize debugfs for the qspinlock statistical counters | ||
206 | */ | ||
207 | static int __init init_qspinlock_stat(void) | ||
208 | { | ||
209 | struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); | ||
210 | int i; | ||
211 | |||
212 | if (!d_qstat) { | ||
213 | pr_warn("Could not create 'qlockstat' debugfs directory\n"); | ||
214 | return 0; | ||
215 | } | ||
216 | |||
217 | /* | ||
218 | * Create the debugfs files | ||
219 | * | ||
220 | * As reading from and writing to the stat files can be slow, only | ||
221 | * root is allowed to do the read/write to limit the impact on system | ||
222 | * performance. | ||
223 | */ | ||
224 | for (i = 0; i < qstat_num; i++) | ||
225 | debugfs_create_file(qstat_names[i], 0400, d_qstat, | ||
226 | (void *)(long)i, &fops_qstat); | ||
227 | |||
228 | debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, | ||
229 | (void *)(long)qstat_reset_cnts, &fops_qstat); | ||
230 | return 0; | ||
231 | } | ||
232 | fs_initcall(init_qspinlock_stat); | ||
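fs_initcall() is late enough for this to be safe: debugfs itself is registered from a core_initcall(), so the "qlockstat" directory and its files can be created at this point.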
233 | |||
234 | /* | ||
235 | * Increment the specified PV qspinlock statistical counter when cond is true | ||
236 | */ | ||
237 | static inline void qstat_inc(enum qlock_stats stat, bool cond) | ||
238 | { | ||
239 | if (cond) | ||
240 | this_cpu_inc(qstats[stat]); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * PV hash hop count | ||
245 | */ | ||
246 | static inline void qstat_hop(int hopcnt) | ||
247 | { | ||
248 | this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Replacement function for pv_kick() | ||
253 | */ | ||
254 | static inline void __pv_kick(int cpu) | ||
255 | { | ||
256 | u64 start = sched_clock(); | ||
257 | |||
258 | per_cpu(pv_kick_time, cpu) = start; | ||
259 | pv_kick(cpu); | ||
260 | this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Replacement function for pv_wait() | ||
265 | */ | ||
266 | static inline void __pv_wait(u8 *ptr, u8 val) | ||
267 | { | ||
268 | u64 *pkick_time = this_cpu_ptr(&pv_kick_time); | ||
269 | |||
270 | *pkick_time = 0; | ||
271 | pv_wait(ptr, val); | ||
272 | if (*pkick_time) { | ||
273 | this_cpu_add(qstats[qstat_pv_latency_wake], | ||
274 | sched_clock() - *pkick_time); | ||
275 | qstat_inc(qstat_pv_kick_wake, true); | ||
276 | } | ||
277 | } | ||
278 | |||
279 | #define pv_kick(c) __pv_kick(c) | ||
280 | #define pv_wait(p, v) __pv_wait(p, v) | ||
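The wake-latency handshake between these two hooks works as follows: __pv_kick() stamps the target vCPU's pv_kick_time with sched_clock() just before kicking it, while __pv_wait() clears its own stamp before sleeping. A vCPU that finds a non-zero stamp when pv_wait() returns was woken by a kick, so it accumulates the elapsed time into pv_latency_wake and counts a pv_kick_wake event; a vCPU that woke up for any other reason sees a zero stamp and records nothing. This is also why pv_kick_wake can differ slightly from pv_kick_unlock, as noted in the header comment.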
281 | |||
282 | /* | ||
283 | * Wrapper to track the number of successful unfair trylocks (lock stealing) | ||
284 | */ | ||
285 | static inline int qstat_spin_steal_lock(struct qspinlock *lock) | ||
286 | { | ||
287 | int ret = pv_queued_spin_steal_lock(lock); | ||
288 | |||
289 | qstat_inc(qstat_pv_lock_stealing, ret); | ||
290 | return ret; | ||
291 | } | ||
292 | #undef queued_spin_trylock | ||
293 | #define queued_spin_trylock(l) qstat_spin_steal_lock(l) | ||
294 | |||
295 | #else /* CONFIG_QUEUED_LOCK_STAT */ | ||
296 | |||
297 | static inline void qstat_inc(enum qlock_stats stat, bool cond) { } | ||
298 | static inline void qstat_hop(int hopcnt) { } | ||
299 | |||
300 | #endif /* CONFIG_QUEUED_LOCK_STAT */ | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1ef0d7aeab47..34cb9f7fc2d2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1905,6 +1905,97 @@ static void ttwu_queue(struct task_struct *p, int cpu) | |||
1905 | raw_spin_unlock(&rq->lock); | 1905 | raw_spin_unlock(&rq->lock); |
1906 | } | 1906 | } |
1907 | 1907 | ||
1908 | /* | ||
1909 | * Notes on Program-Order guarantees on SMP systems. | ||
1910 | * | ||
1911 | * MIGRATION | ||
1912 | * | ||
1913 | * The basic program-order guarantee on SMP systems is that when a task [t] | ||
1914 | * migrates, all its activity on its old cpu [c0] happens-before any subsequent | ||
1915 | * execution on its new cpu [c1]. | ||
1916 | * | ||
1917 | * For migration (of runnable tasks) this is provided by the following means: | ||
1918 | * | ||
1919 | * A) UNLOCK of the rq(c0)->lock scheduling out task t | ||
1920 | * B) migration for t is required to synchronize *both* rq(c0)->lock and | ||
1921 | * rq(c1)->lock (if not at the same time, then in that order). | ||
1922 | * C) LOCK of the rq(c1)->lock scheduling in task | ||
1923 | * | ||
1924 | * Transitivity guarantees that B happens after A and C after B. | ||
1925 | * Note: we only require RCpc transitivity. | ||
1926 | * Note: the cpu doing B need not be c0 or c1 | ||
1927 | * | ||
1928 | * Example: | ||
1929 | * | ||
1930 | * CPU0 CPU1 CPU2 | ||
1931 | * | ||
1932 | * LOCK rq(0)->lock | ||
1933 | * sched-out X | ||
1934 | * sched-in Y | ||
1935 | * UNLOCK rq(0)->lock | ||
1936 | * | ||
1937 | * LOCK rq(0)->lock // orders against CPU0 | ||
1938 | * dequeue X | ||
1939 | * UNLOCK rq(0)->lock | ||
1940 | * | ||
1941 | * LOCK rq(1)->lock | ||
1942 | * enqueue X | ||
1943 | * UNLOCK rq(1)->lock | ||
1944 | * | ||
1945 | * LOCK rq(1)->lock // orders against CPU2 | ||
1946 | * sched-out Z | ||
1947 | * sched-in X | ||
1948 | * UNLOCK rq(1)->lock | ||
1949 | * | ||
1950 | * | ||
1951 | * BLOCKING -- aka. SLEEP + WAKEUP | ||
1952 | * | ||
1953 | * For blocking we (obviously) need to provide the same guarantee as for | ||
1954 | * migration. However, the means are completely different, as there is no lock | ||
1955 | * chain to provide order. Instead we do: | ||
1956 | * | ||
1957 | * 1) smp_store_release(X->on_cpu, 0) | ||
1958 | * 2) smp_cond_acquire(!X->on_cpu) | ||
1959 | * | ||
1960 | * Example: | ||
1961 | * | ||
1962 | * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) | ||
1963 | * | ||
1964 | * LOCK rq(0)->lock LOCK X->pi_lock | ||
1965 | * dequeue X | ||
1966 | * sched-out X | ||
1967 | * smp_store_release(X->on_cpu, 0); | ||
1968 | * | ||
1969 | * smp_cond_acquire(!X->on_cpu); | ||
1970 | * X->state = WAKING | ||
1971 | * set_task_cpu(X,2) | ||
1972 | * | ||
1973 | * LOCK rq(2)->lock | ||
1974 | * enqueue X | ||
1975 | * X->state = RUNNING | ||
1976 | * UNLOCK rq(2)->lock | ||
1977 | * | ||
1978 | * LOCK rq(2)->lock // orders against CPU1 | ||
1979 | * sched-out Z | ||
1980 | * sched-in X | ||
1981 | * UNLOCK rq(2)->lock | ||
1982 | * | ||
1983 | * UNLOCK X->pi_lock | ||
1984 | * UNLOCK rq(0)->lock | ||
1985 | * | ||
1986 | * | ||
1987 | * However, for wakeups there is a second guarantee we must provide, namely we | ||
1988 | * must observe the state that led to our wakeup. That is, not only must our | ||
1989 | * task observe its own prior state, it must also observe the stores prior to | ||
1990 | * its wakeup. | ||
1991 | * | ||
1992 | * This means that any mechanism for doing remote wakeups must order the CPU doing | ||
1993 | * the wakeup against the CPU the task is going to end up running on. This, | ||
1994 | * however, is already required for the regular Program-Order guarantee above, | ||
1995 | * since the waking CPU is the one issuing the ACQUIRE (smp_cond_acquire). | ||
1996 | * | ||
1997 | */ | ||
1998 | |||
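For reference, the smp_cond_acquire() primitive used above, added to include/linux/compiler.h elsewhere in this merge, is built from a control dependency plus an smp_rmb(); its definition is essentially:

	#define smp_cond_acquire(cond)	do {		\
		while (!(cond))				\
			cpu_relax();			\
		smp_rmb(); /* ctrl + rmb := acquire */	\
	} while (0)

so the spin on !X->on_cpu upgrades to ACQUIRE semantics once the condition becomes true.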
1908 | /** | 1999 | /** |
1909 | * try_to_wake_up - wake up a thread | 2000 | * try_to_wake_up - wake up a thread |
1910 | * @p: the thread to be awakened | 2001 | * @p: the thread to be awakened |
@@ -1968,19 +2059,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
1968 | /* | 2059 | /* |
1969 | * If the owning (remote) cpu is still in the middle of schedule() with | 2060 | * If the owning (remote) cpu is still in the middle of schedule() with |
1970 | * this task as prev, wait until it's done referencing the task. | 2061 | * this task as prev, wait until it's done referencing the task. |
1971 | */ | ||
1972 | while (p->on_cpu) | ||
1973 | cpu_relax(); | ||
1974 | /* | ||
1975 | * Combined with the control dependency above, we have an effective | ||
1976 | * smp_load_acquire() without the need for full barriers. | ||
1977 | * | 2062 | * |
1978 | * Pairs with the smp_store_release() in finish_lock_switch(). | 2063 | * Pairs with the smp_store_release() in finish_lock_switch(). |
1979 | * | 2064 | * |
1980 | * This ensures that tasks getting woken will be fully ordered against | 2065 | * This ensures that tasks getting woken will be fully ordered against |
1981 | * their previous state and preserve Program Order. | 2066 | * their previous state and preserve Program Order. |
1982 | */ | 2067 | */ |
1983 | smp_rmb(); | 2068 | smp_cond_acquire(!p->on_cpu); |
1984 | 2069 | ||
1985 | p->sched_contributes_to_load = !!task_contributes_to_load(p); | 2070 | p->sched_contributes_to_load = !!task_contributes_to_load(p); |
1986 | p->state = TASK_WAKING; | 2071 | p->state = TASK_WAKING; |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b242775bf670..1e0bb4afe3fd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -1076,7 +1076,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
1076 | * In particular, the load of prev->state in finish_task_switch() must | 1076 | * In particular, the load of prev->state in finish_task_switch() must |
1077 | * happen before this. | 1077 | * happen before this. |
1078 | * | 1078 | * |
1079 | * Pairs with the control dependency and rmb in try_to_wake_up(). | 1079 | * Pairs with the smp_cond_acquire() in try_to_wake_up(). |
1080 | */ | 1080 | */ |
1081 | smp_store_release(&prev->on_cpu, 0); | 1081 | smp_store_release(&prev->on_cpu, 0); |
1082 | #endif | 1082 | #endif |
diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 83c33a5bcffb..18e422b259cf 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c | |||
@@ -27,6 +27,65 @@ do { \ | |||
27 | (unsigned long long)r); \ | 27 | (unsigned long long)r); \ |
28 | } while (0) | 28 | } while (0) |
29 | 29 | ||
30 | /* | ||
31 | * Test an atomic operation family: | ||
32 | * @test should be a macro accepting parameters (bit, op, ...). | ||
33 | */ | ||
34 | |||
35 | #define FAMILY_TEST(test, bit, op, args...) \ | ||
36 | do { \ | ||
37 | test(bit, op, ##args); \ | ||
38 | test(bit, op##_acquire, ##args); \ | ||
39 | test(bit, op##_release, ##args); \ | ||
40 | test(bit, op##_relaxed, ##args); \ | ||
41 | } while (0) | ||
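To illustrate the expansion with an editorial example, FAMILY_TEST(TEST_RETURN, 64, add_return, +=, v1) generates the four calls

	TEST_RETURN(64, add_return, +=, v1);
	TEST_RETURN(64, add_return_acquire, +=, v1);
	TEST_RETURN(64, add_return_release, +=, v1);
	TEST_RETURN(64, add_return_relaxed, +=, v1);

covering the fully ordered, acquire, release and relaxed variants of atomic64_add_return().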
42 | |||
43 | #define TEST_RETURN(bit, op, c_op, val) \ | ||
44 | do { \ | ||
45 | atomic##bit##_set(&v, v0); \ | ||
46 | r = v0; \ | ||
47 | r c_op val; \ | ||
48 | BUG_ON(atomic##bit##_##op(val, &v) != r); \ | ||
49 | BUG_ON(atomic##bit##_read(&v) != r); \ | ||
50 | } while (0) | ||
51 | |||
52 | #define RETURN_FAMILY_TEST(bit, op, c_op, val) \ | ||
53 | do { \ | ||
54 | FAMILY_TEST(TEST_RETURN, bit, op, c_op, val); \ | ||
55 | } while (0) | ||
56 | |||
57 | #define TEST_ARGS(bit, op, init, ret, expect, args...) \ | ||
58 | do { \ | ||
59 | atomic##bit##_set(&v, init); \ | ||
60 | BUG_ON(atomic##bit##_##op(&v, ##args) != ret); \ | ||
61 | BUG_ON(atomic##bit##_read(&v) != expect); \ | ||
62 | } while (0) | ||
63 | |||
64 | #define XCHG_FAMILY_TEST(bit, init, new) \ | ||
65 | do { \ | ||
66 | FAMILY_TEST(TEST_ARGS, bit, xchg, init, init, new, new); \ | ||
67 | } while (0) | ||
68 | |||
69 | #define CMPXCHG_FAMILY_TEST(bit, init, new, wrong) \ | ||
70 | do { \ | ||
71 | FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \ | ||
72 | init, init, new, init, new); \ | ||
73 | FAMILY_TEST(TEST_ARGS, bit, cmpxchg, \ | ||
74 | init, init, init, wrong, new); \ | ||
75 | } while (0) | ||
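The two FAMILY_TEST() invocations above cover both cmpxchg() outcomes: the first compares against the current value init, so the exchange succeeds (init is returned and the variable becomes new); the second compares against wrong, so the exchange fails (init is still returned and the variable keeps init).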
76 | |||
77 | #define INC_RETURN_FAMILY_TEST(bit, i) \ | ||
78 | do { \ | ||
79 | FAMILY_TEST(TEST_ARGS, bit, inc_return, \ | ||
80 | i, (i) + one, (i) + one); \ | ||
81 | } while (0) | ||
82 | |||
83 | #define DEC_RETURN_FAMILY_TEST(bit, i) \ | ||
84 | do { \ | ||
85 | FAMILY_TEST(TEST_ARGS, bit, dec_return, \ | ||
86 | i, (i) - one, (i) - one); \ | ||
87 | } while (0) | ||
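Note that INC_RETURN_FAMILY_TEST() and DEC_RETURN_FAMILY_TEST() reference a variable named one at their expansion site; each caller is expected to declare a suitably typed one (long long for the 64-bit tests).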
88 | |||
30 | static __init void test_atomic(void) | 89 | static __init void test_atomic(void) |
31 | { | 90 | { |
32 | int v0 = 0xaaa31337; | 91 | int v0 = 0xaaa31337; |
@@ -45,6 +104,18 @@ static __init void test_atomic(void) | |||
45 | TEST(, and, &=, v1); | 104 | TEST(, and, &=, v1); |
46 | TEST(, xor, ^=, v1); | 105 | TEST(, xor, ^=, v1); |
47 | TEST(, andnot, &= ~, v1); | 106 | TEST(, andnot, &= ~, v1); |
107 | |||
108 | RETURN_FAMILY_TEST(, add_return, +=, onestwos); | ||
109 | RETURN_FAMILY_TEST(, add_return, +=, -one); | ||
110 | RETURN_FAMILY_TEST(, sub_return, -=, onestwos); | ||
111 | RETURN_FAMILY_TEST(, sub_return, -=, -one); | ||
112 | |||
113 | INC_RETURN_FAMILY_TEST(, v0); | ||
114 | DEC_RETURN_FAMILY_TEST(, v0); | ||
115 | |||
116 | XCHG_FAMILY_TEST(, v0, v1); | ||
117 | CMPXCHG_FAMILY_TEST(, v0, v1, onestwos); | ||
118 | |||
48 | } | 119 | } |
49 | 120 | ||
50 | #define INIT(c) do { atomic64_set(&v, c); r = c; } while (0) | 121 | #define INIT(c) do { atomic64_set(&v, c); r = c; } while (0) |
@@ -74,25 +145,10 @@ static __init void test_atomic64(void) | |||
74 | TEST(64, xor, ^=, v1); | 145 | TEST(64, xor, ^=, v1); |
75 | TEST(64, andnot, &= ~, v1); | 146 | TEST(64, andnot, &= ~, v1); |
76 | 147 | ||
77 | INIT(v0); | 148 | RETURN_FAMILY_TEST(64, add_return, +=, onestwos); |
78 | r += onestwos; | 149 | RETURN_FAMILY_TEST(64, add_return, +=, -one); |
79 | BUG_ON(atomic64_add_return(onestwos, &v) != r); | 150 | RETURN_FAMILY_TEST(64, sub_return, -=, onestwos); |
80 | BUG_ON(v.counter != r); | 151 | RETURN_FAMILY_TEST(64, sub_return, -=, -one); |
81 | |||
82 | INIT(v0); | ||
83 | r += -one; | ||
84 | BUG_ON(atomic64_add_return(-one, &v) != r); | ||
85 | BUG_ON(v.counter != r); | ||
86 | |||
87 | INIT(v0); | ||
88 | r -= onestwos; | ||
89 | BUG_ON(atomic64_sub_return(onestwos, &v) != r); | ||
90 | BUG_ON(v.counter != r); | ||
91 | |||
92 | INIT(v0); | ||
93 | r -= -one; | ||
94 | BUG_ON(atomic64_sub_return(-one, &v) != r); | ||
95 | BUG_ON(v.counter != r); | ||
96 | 152 | ||
97 | INIT(v0); | 153 | INIT(v0); |
98 | atomic64_inc(&v); | 154 | atomic64_inc(&v); |
@@ -100,33 +156,15 @@ static __init void test_atomic64(void) | |||
100 | BUG_ON(v.counter != r); | 156 | BUG_ON(v.counter != r); |
101 | 157 | ||
102 | INIT(v0); | 158 | INIT(v0); |
103 | r += one; | ||
104 | BUG_ON(atomic64_inc_return(&v) != r); | ||
105 | BUG_ON(v.counter != r); | ||
106 | |||
107 | INIT(v0); | ||
108 | atomic64_dec(&v); | 159 | atomic64_dec(&v); |
109 | r -= one; | 160 | r -= one; |
110 | BUG_ON(v.counter != r); | 161 | BUG_ON(v.counter != r); |
111 | 162 | ||
112 | INIT(v0); | 163 | INC_RETURN_FAMILY_TEST(64, v0); |
113 | r -= one; | 164 | DEC_RETURN_FAMILY_TEST(64, v0); |
114 | BUG_ON(atomic64_dec_return(&v) != r); | ||
115 | BUG_ON(v.counter != r); | ||
116 | |||
117 | INIT(v0); | ||
118 | BUG_ON(atomic64_xchg(&v, v1) != v0); | ||
119 | r = v1; | ||
120 | BUG_ON(v.counter != r); | ||
121 | |||
122 | INIT(v0); | ||
123 | BUG_ON(atomic64_cmpxchg(&v, v0, v1) != v0); | ||
124 | r = v1; | ||
125 | BUG_ON(v.counter != r); | ||
126 | 165 | ||
127 | INIT(v0); | 166 | XCHG_FAMILY_TEST(64, v0, v1); |
128 | BUG_ON(atomic64_cmpxchg(&v, v2, v1) != v0); | 167 | CMPXCHG_FAMILY_TEST(64, v0, v1, v2); |
129 | BUG_ON(v.counter != r); | ||
130 | 168 | ||
131 | INIT(v0); | 169 | INIT(v0); |
132 | BUG_ON(atomic64_add_unless(&v, one, v0)); | 170 | BUG_ON(atomic64_add_unless(&v, one, v0)); |