3 files changed, 576 insertions, 58 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..393d1874b9e0 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
 * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
 * (C) Copyright 2013-2014 Red Hat, Inc.
 * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
 *
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
 *          Peter Zijlstra <peterz@infradead.org>
 */
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
        struct __qspinlock *l = (void *)lock;
-        return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+        /*
+         * Use release semantics to make sure that the MCS node is properly
+         * initialized before changing the tail code.
+         */
+        return (u32)xchg_release(&l->tail,
+                                 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 #else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
        for (;;) {
                new = (val & _Q_LOCKED_PENDING_MASK) | tail;
-                old = atomic_cmpxchg(&lock->val, val, new);
+                /*
+                 * Use release semantics to make sure that the MCS node is
+                 * properly initialized before changing the tail code.
+                 */
+                old = atomic_cmpxchg_release(&lock->val, val, new);
                if (old == val)
                        break;
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
 */
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+                                           struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
                                           struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
+static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
-                                           struct mcs_spinlock *node) { }
+                                                   struct mcs_spinlock *node)
+                                                   { return 0; }
 #define pv_enabled()            false
 #define pv_init_node            __pv_init_node
 #define pv_wait_node            __pv_wait_node
 #define pv_kick_node            __pv_kick_node
-#define pv_wait_head            __pv_wait_head
+#define pv_wait_head_or_lock    __pv_wait_head_or_lock
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath       native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
                if (val == new)
                        new |= _Q_PENDING_VAL;
-                old = atomic_cmpxchg(&lock->val, val, new);
+                /*
+                 * Acquire semantics are required here as the function may
+                 * return immediately if the lock was free.
+                 */
+                old = atomic_cmpxchg_acquire(&lock->val, val, new);
                if (old == val)
                        break;
@@ -382,6 +398,7 @@ queue:
         * p,*,* -> n,*,*
         */
        old = xchg_tail(lock, tail);
+        next = NULL;
        /*
         * if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
                prev = decode_tail(old);
                WRITE_ONCE(prev->next, node);
-                pv_wait_node(node);
+                pv_wait_node(node, prev);
                arch_mcs_spin_lock_contended(&node->locked);
+                /*
+                 * While waiting for the MCS lock, the next pointer may have
+                 * been set by another lock waiter. We optimistically load
+                 * the next pointer & prefetch the cacheline for writing
+                 * to reduce latency in the upcoming MCS unlock operation.
+                 */
+                next = READ_ONCE(node->next);
+                if (next)
+                        prefetchw(next);
        }
        /*
@@ -406,11 +433,22 @@ queue:
         * sequentiality; this is because the set_locked() function below
         * does not imply a full barrier.
         *
+         * The PV pv_wait_head_or_lock function, if active, will acquire
+         * the lock and return a non-zero value. So we have to skip the
+         * smp_load_acquire() call. As the next PV queue head hasn't been
+         * designated yet, there is no way for the locked value to become
+         * _Q_SLOW_VAL. So both the set_locked() and the
+         * atomic_cmpxchg_relaxed() calls will be safe.
+         *
+         * If PV isn't active, 0 will be returned instead.
+         *
         */
-        pv_wait_head(lock, node);
+        if ((val = pv_wait_head_or_lock(lock, node)))
-        while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
+                goto locked;
-                cpu_relax();
+        smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+locked:
        /*
         * claim the lock:
         *
@@ -422,11 +460,17 @@ queue:
         * to grab the lock.
         */
        for (;;) {
-                if (val != tail) {
+                /* In the PV case we might already have _Q_LOCKED_VAL set */
+                if ((val & _Q_TAIL_MASK) != tail) {
                        set_locked(lock);
                        break;
                }
-                old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+                /*
+                 * The smp_cond_acquire() call above has provided the necessary
+                 * acquire semantics required for locking. At most two
+                 * iterations of this loop may be run.
+                 */
+                old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
                if (old == val)
                        goto release;   /* No contention */
@@ -434,10 +478,12 @@ queue:
        }
        /*
-         * contended path; wait for next, release.
+         * contended path; wait for next if not observed yet, release.
         */
-        while (!(next = READ_ONCE(node->next)))
+        if (!next) {
-                cpu_relax();
+                while (!(next = READ_ONCE(node->next)))
+                        cpu_relax();
+        }
        arch_mcs_spin_unlock_contended(&next->locked);
        pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_init_node
 #undef pv_wait_node
 #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
 #undef  queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath       __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff4829b..87bb235c3448 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
 #define _Q_SLOW_VAL     (3U << _Q_LOCKED_OFFSET)
 /*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK      0xff
+/*
 * Queue node uses: vcpu_running & vcpu_halted.
 * Queue head uses: vcpu_running & vcpu_hashed.
 */
@@ -41,6 +55,94 @@ struct pv_node {
 };
 /*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enters the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l)  pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+        struct __qspinlock *qlock = (void *)lock;
+        /*
+         * Plain read first: give up without attempting the atomic operation
+         * when any bit covered by _Q_LOCKED_PENDING_MASK is already set.
+         */
+        if (atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK)
+                return false;
+        /* The steal succeeds only if the locked byte goes 0 -> _Q_LOCKED_VAL. */
+        return cmpxchg(&qlock->locked, 0, _Q_LOCKED_VAL) == 0;
+}
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+        /* Store 1 to the pending field through the __qspinlock view of @lock. */
+        WRITE_ONCE(((struct __qspinlock *)lock)->pending, 1);
+}
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+        /* Store 0 to the pending field through the __qspinlock view of @lock. */
+        WRITE_ONCE(((struct __qspinlock *)lock)->pending, 0);
+}
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+        struct __qspinlock *qlock = (void *)lock;
+        /* Skip the atomic operation entirely while the locked byte is set. */
+        if (READ_ONCE(qlock->locked))
+                return 0;
+        /*
+         * Swap the combined locked+pending field from "pending only" to
+         * "locked only" in one step; returns 1 on success, 0 otherwise.
+         */
+        return cmpxchg(&qlock->locked_pending, _Q_PENDING_VAL,
+                       _Q_LOCKED_VAL) == _Q_PENDING_VAL;
+}
+#else /* _Q_PENDING_BITS == 8 */
+/* Set the _Q_PENDING_VAL bit(s) in lock->val via atomic_set_mask(). */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+        atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+/* Clear the _Q_PENDING_VAL bit(s) in lock->val via atomic_clear_mask(). */
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+        atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+        int cur = atomic_read(&lock->val);
+        /*
+         * Retry for as long as the locked bits are clear. Each failed
+         * cmpxchg hands back the current lock word, which is re-examined.
+         */
+        while (!(cur & _Q_LOCKED_MASK)) {
+                int expected = cur;
+                /* Clear the pending bit & set the locked bit */
+                int desired = (cur & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+                cur = atomic_cmpxchg(&lock->val, expected, desired);
+                if (cur == expected)
+                        return 1;
+        }
+        /* Locked bits were observed set: the lock was not acquired. */
+        return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+/*
 * Lock and MCS node addresses hash table for fast lookup
 *
 * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
        struct pv_hash_entry *he;
+        int hopcnt = 0;
        for_each_hash_entry(he, offset, hash) {
+                hopcnt++;
                if (!cmpxchg(&he->lock, NULL, lock)) {
                        WRITE_ONCE(he->node, node);
+                        qstat_hop(hopcnt);
                        return &he->lock;
                }
        }
@@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 }
 /*
+ * Return true if it is time to check the previous node and that node is
+ * not in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+        /*
+         * prev->state is read only on iterations where the low bits of
+         * @loop selected by PV_PREV_CHECK_MASK are all zero; on every other
+         * iteration the answer is false without touching @prev.
+         */
+        return !(loop & PV_PREV_CHECK_MASK) &&
+               READ_ONCE(prev->state) != vcpu_running;
+}
+/*
 * Initialize the PV part of the mcs_spinlock node.
 */
 static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
 * behalf.
 */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
        struct pv_node *pn = (struct pv_node *)node;
+        struct pv_node *pp = (struct pv_node *)prev;
+        int waitcnt = 0;
        int loop;
+        bool wait_early;
-        for (;;) {
+        /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
-                for (loop = SPIN_THRESHOLD; loop; loop--) {
+        for (;; waitcnt++) {
+                for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
                        if (READ_ONCE(node->locked))
                                return;
+                        if (pv_wait_early(pp, loop)) {
+                                wait_early = true;
+                                break;
+                        }
                        cpu_relax();
                }
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 */
                smp_store_mb(pn->state, vcpu_halted);
-                if (!READ_ONCE(node->locked))
+                if (!READ_ONCE(node->locked)) {
+                        qstat_inc(qstat_pv_wait_node, true);
+                        qstat_inc(qstat_pv_wait_again, waitcnt);
+                        qstat_inc(qstat_pv_wait_early, wait_early);
                        pv_wait(&pn->state, vcpu_halted);
+                }
                /*
-                 * If pv_kick_node() changed us to vcpu_hashed, retain that value
+                 * If pv_kick_node() changed us to vcpu_hashed, retain that
-                 * so that pv_wait_head() knows to not also try to hash this lock.
+                 * value so that pv_wait_head_or_lock() knows to not also try
+                 * to hash this lock.
                 */
                cmpxchg(&pn->state, vcpu_halted, vcpu_running);
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 * So it is better to spin for a while in the hope that the
                 * MCS lock will be released soon.
                 */
+                qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
        }
        /*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 /*
 * Called after setting next->locked = 1 when we're the lock owner.
 *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
 */
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 }
 /*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
 * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
 */
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
        struct pv_node *pn = (struct pv_node *)node;
        struct __qspinlock *l = (void *)lock;
        struct qspinlock **lp = NULL;
+        int waitcnt = 0;
        int loop;
        /*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
        if (READ_ONCE(pn->state) == vcpu_hashed)
                lp = (struct qspinlock **)1;
-        for (;;) {
+        for (;; waitcnt++) {
+                /*
+                 * Set correct vCPU state to be used by queue node wait-early
+                 * mechanism.
+                 */
+                WRITE_ONCE(pn->state, vcpu_running);
+                /*
+                 * Set the pending bit in the active lock spinning loop to
+                 * disable lock stealing before attempting to acquire the lock.
+                 */
+                set_pending(lock);
                for (loop = SPIN_THRESHOLD; loop; loop--) {
-                        if (!READ_ONCE(l->locked))
+                        if (trylock_clear_pending(lock))
-                                return;
+                                goto gotlock;
                        cpu_relax();
                }
+                clear_pending(lock);
                if (!lp) { /* ONCE */
                        lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
                         *
                         * Matches the smp_rmb() in __pv_queued_spin_unlock().
                         */
-                        if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+                        if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
                                /*
-                                 * The lock is free and _Q_SLOW_VAL has never
+                                 * The lock was free and now we own the lock.
-                                 * been set. Therefore we need to unhash before
+                                 * Change the lock value back to _Q_LOCKED_VAL
-                                 * getting the lock.
+                                 * and unhash the table.
                                 */
+                                WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
                                WRITE_ONCE(*lp, NULL);
-                                return;
+                                goto gotlock;
                        }
                }
+                WRITE_ONCE(pn->state, vcpu_halted);
+                qstat_inc(qstat_pv_wait_head, true);
+                qstat_inc(qstat_pv_wait_again, waitcnt);
                pv_wait(&l->locked, _Q_SLOW_VAL);
                /*
                 * The unlocker should have freed the lock before kicking the
                 * CPU. So if the lock is still not free, it is a spurious
-                 * wakeup and so the vCPU should wait again after spinning for
+                 * wakeup or another vCPU has stolen the lock. The current
-                 * a while.
+                 * vCPU should spin again.
                 */
+                qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
        }
        /*
-         * Lock is unlocked now; the caller will acquire it without waiting.
+         * The cmpxchg() or xchg() call before coming here provides the
-         * As with pv_wait_node() we rely on the caller to do a load-acquire
+         * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
-         * for us.
+         * here is to indicate to the compiler that the value will always
+         * be nonzero to enable better code optimization.
         */
+gotlock:
+        return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
 }
 /*
- * PV version of the unlock function to be used in stead of
+ * PV versions of the unlock fastpath and slowpath functions to be used
- * queued_spin_unlock().
+ * instead of queued_spin_unlock().
 */
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 {
        struct __qspinlock *l = (void *)lock;
        struct pv_node *node;
-        u8 locked;
-        /*
-         * We must not unlock if SLOW, because in that case we must first
-         * unhash. Otherwise it would be possible to have multiple @lock
-         * entries, which would be BAD.
-         */
-        locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
-        if (likely(locked == _Q_LOCKED_VAL))
-                return;
        if (unlikely(locked != _Q_SLOW_VAL)) {
                WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
         * so we need a barrier to order the read of the node data in
         * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
         *
-         * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+         * Matches the xchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
         */
        smp_rmb();
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
         * vCPU is harmless other than the additional latency in completing
         * the unlock.
         */
+        qstat_inc(qstat_pv_kick_unlock, true);
        pv_kick(node->cpu);
 }
 /*
 * Include the architecture specific callee-save thunk of the
 * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
- * that the callee-save thunk and the real unlock function are close
+ * function close to each other sharing consecutive instruction cachelines.
- * to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
 */
 #include <asm/qspinlock_paravirt.h>
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        u8 prev;
+        /*
+         * Fastpath: release only when the locked byte is exactly
+         * _Q_LOCKED_VAL. We must not unlock if SLOW, because in that case
+         * we must first unhash. Otherwise it would be possible to have
+         * multiple @lock entries, which would be BAD.
+         */
+        prev = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+        /* Any other observed value is handed to the slowpath. */
+        if (unlikely(prev != _Q_LOCKED_VAL))
+                __pv_queued_spin_unlock_slowpath(lock, prev);
+}
+#endif /* __pv_queued_spin_unlock */
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000000..640dcecdd1df
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,300 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ *   pv_hash_hops       - average # of hops per hashing operation
+ *   pv_kick_unlock     - # of vCPU kicks issued at unlock time
+ *   pv_kick_wake       - # of vCPU kicks used for computing pv_latency_wake
+ *   pv_latency_kick    - average latency (ns) of vCPU kick operation
+ *   pv_latency_wake    - average latency (ns) from vCPU kick to wakeup
+ *   pv_lock_stealing   - # of lock stealing operations
+ *   pv_spurious_wakeup - # of spurious wakeups
+ *   pv_wait_again      - # of vCPU wait's that happened after a vCPU kick
+ *   pv_wait_early      - # of early vCPU wait's
+ *   pv_wait_head       - # of vCPU wait's at the queue head
+ *   pv_wait_node       - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+/*
+ * Counter IDs. Each value below qstat_num is both the index into the
+ * per-cpu qstats[] array and the ID stored in the matching debugfs file.
+ */
+enum qlock_stats {
+        qstat_pv_hash_hops,
+        qstat_pv_kick_unlock,
+        qstat_pv_kick_wake,
+        qstat_pv_latency_kick,
+        qstat_pv_latency_wake,
+        qstat_pv_lock_stealing,
+        qstat_pv_spurious_wakeup,
+        qstat_pv_wait_again,
+        qstat_pv_wait_early,
+        qstat_pv_wait_head,
+        qstat_pv_wait_node,
+        qstat_num,      /* Total number of statistical counters */
+        /* ID of the "reset_counters" file; it has no slot in qstats[]. */
+        qstat_reset_cnts = qstat_num,
+};
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+/*
+ * debugfs file name for every counter ID, plus the reset file. Entries are
+ * listed in enum qlock_stats order; the designated initializers bind each
+ * name to its ID regardless of the order written here.
+ */
+static const char * const qstat_names[qstat_num + 1] = {
+        [qstat_pv_hash_hops]       = "pv_hash_hops",
+        [qstat_pv_kick_unlock]     = "pv_kick_unlock",
+        [qstat_pv_kick_wake]       = "pv_kick_wake",
+        [qstat_pv_latency_kick]    = "pv_latency_kick",
+        [qstat_pv_latency_wake]    = "pv_latency_wake",
+        [qstat_pv_lock_stealing]   = "pv_lock_stealing",
+        [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+        [qstat_pv_wait_again]      = "pv_wait_again",
+        [qstat_pv_wait_early]      = "pv_wait_early",
+        [qstat_pv_wait_head]       = "pv_wait_head",
+        [qstat_pv_wait_node]       = "pv_wait_node",
+        [qstat_reset_cnts]         = "reset_counters",
+};
+/*
+ * Per-cpu counters
+ */
+/* qstats: one slot per counter ID below qstat_num, kept separately per CPU. */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+/*
+ * pv_kick_time: sched_clock() stamp that __pv_kick() stores for the CPU
+ * being kicked; __pv_wait() zeroes it before waiting and reads it afterwards.
+ */
+static DEFINE_PER_CPU(u64, pv_kick_time);
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ *    Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+                          size_t count, loff_t *ppos)
+{
+        char buf[64];
+        int cpu, counter, len;
+        u64 stat = 0, kicks = 0;
+        /*
+         * Get the counter ID stored in file->f_inode->i_private
+         */
+        if (!file->f_inode) {
+                WARN_ON_ONCE(1);
+                return -EBADF;
+        }
+        counter = (long)(file->f_inode->i_private);
+        /* The reset file (ID == qstat_num) has no counter to report. */
+        if (counter >= qstat_num)
+                return -EBADF;
+        /*
+         * Sum the requested counter over all possible CPUs. For the
+         * averaged counters also sum the matching divisor into "kicks".
+         */
+        for_each_possible_cpu(cpu) {
+                stat += per_cpu(qstats[counter], cpu);
+                /*
+                 * Need to sum additional counter for some of them
+                 */
+                switch (counter) {
+                case qstat_pv_latency_kick:
+                case qstat_pv_hash_hops:
+                        kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+                        break;
+                case qstat_pv_latency_wake:
+                        kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+                        break;
+                }
+        }
+        if (counter == qstat_pv_hash_hops) {
+                u64 frac = 0;
+                /*
+                 * Only divide when there is something to divide by; with
+                 * no kicks recorded yet the raw hop total is reported as
+                 * "<total>.00" instead of dividing by zero.
+                 *
+                 * NOTE(review): do_div() takes a 32-bit divisor in the
+                 * generic implementation, so a kick total above 2^32
+                 * would be truncated here - confirm whether that matters.
+                 */
+                if (kicks) {
+                        frac = 100ULL * do_div(stat, kicks);
+                        frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+                        /*
+                         * Rounding can push the fraction up to 100
+                         * (e.g. remainder/kicks >= 0.995); carry it into
+                         * the integer part so "X.100" is never printed.
+                         */
+                        if (frac >= 100) {
+                                stat++;
+                                frac -= 100;
+                        }
+                }
+                /*
+                 * Return a X.XX decimal number
+                 */
+                len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+        } else {
+                /*
+                 * Round to the nearest ns. The summed latency must be
+                 * kept as the dividend; when no kicks were recorded the
+                 * raw sum is reported unchanged.
+                 */
+                if ((counter == qstat_pv_latency_kick) ||
+                    (counter == qstat_pv_latency_wake)) {
+                        if (kicks)
+                                stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+                }
+                len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+        }
+        /* Copy the formatted text out, honouring *ppos and count. */
+        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+/*
+ * Function to handle write request
+ *
+ * When counter = reset_cnts, reset all the counter values.
+ * Since the counter updates aren't atomic, the resetting is done twice
+ * to make sure that the counters are very likely to be all cleared.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+                           size_t count, loff_t *ppos)
+{
+        int cpu;
+        /* The counter ID lives in file->f_inode->i_private. */
+        if (!file->f_inode) {
+                WARN_ON_ONCE(1);
+                return -EBADF;
+        }
+        /*
+         * Only a write to the reset file does anything; a write to any
+         * other file is accepted and ignored. The written data itself is
+         * never examined.
+         */
+        if ((long)(file->f_inode->i_private) == qstat_reset_cnts) {
+                for_each_possible_cpu(cpu) {
+                        unsigned long *cnts = per_cpu_ptr(qstats, cpu);
+                        int pass, idx;
+                        /*
+                         * Counter updates are not atomic, so each CPU's
+                         * counters are cleared in two full passes.
+                         */
+                        for (pass = 0; pass < 2; pass++)
+                                for (idx = 0; idx < qstat_num; idx++)
+                                        WRITE_ONCE(cnts[idx], 0);
+                }
+        }
+        return count;
+}
+/*
+ * Debugfs data structures
+ */
+/*
+ * File operations shared by every qlockstat debugfs file; the handlers
+ * tell the files apart by the counter ID stored in the inode's i_private.
+ */
+static const struct file_operations fops_qstat = {
+        .read = qstat_read,
+        .write = qstat_write,
+        .llseek = default_llseek,
+};
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ */
+static int __init init_qspinlock_stat(void)
+{
+        struct dentry *dir;
+        int id;
+        dir = debugfs_create_dir("qlockstat", NULL);
+        if (!dir) {
+                pr_warn("Could not create 'qlockstat' debugfs directory\n");
+                return 0;
+        }
+        /*
+         * Create the debugfs files
+         *
+         * As reading from and writing to the stat files can be slow, only
+         * root is allowed to do the read/write to limit impact to system
+         * performance.
+         *
+         * IDs below qstat_num are the read-only (0400) counter files; the
+         * last ID, qstat_reset_cnts (== qstat_num), is the write-only
+         * (0200) reset file.
+         */
+        for (id = 0; id <= qstat_num; id++)
+                debugfs_create_file(qstat_names[id],
+                                    id == qstat_reset_cnts ? 0200 : 0400,
+                                    dir, (void *)(long)id, &fops_qstat);
+        return 0;
+}
+fs_initcall(init_qspinlock_stat);
+/*
+ * Increment the PV qspinlock statistical counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+        /* Nothing is counted unless the caller's condition holds. */
+        if (!cond)
+                return;
+        this_cpu_inc(qstats[stat]);
+}
+/*
+ * PV hash hop count
+ */
+/* Add @hopcnt to this CPU's qstat_pv_hash_hops counter. */
+static inline void qstat_hop(int hopcnt)
+{
+        this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+        u64 kick_start = sched_clock();
+        u64 elapsed;
+        /* Leave the stamp for the target CPU before the kick is issued. */
+        per_cpu(pv_kick_time, cpu) = kick_start;
+        pv_kick(cpu);
+        /* Time spent in the kick call is charged to the kicking CPU. */
+        elapsed = sched_clock() - kick_start;
+        this_cpu_add(qstats[qstat_pv_latency_kick], elapsed);
+}
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+        u64 *kick_stamp = this_cpu_ptr(&pv_kick_time);
+        /*
+         * Zero the stamp first so that a non-zero value after pv_wait()
+         * returns means a kick time was recorded for this CPU meanwhile.
+         */
+        *kick_stamp = 0;
+        pv_wait(ptr, val);
+        if (!*kick_stamp)
+                return;
+        /* Kick-to-wakeup latency, plus one more kick counted as a wakeup. */
+        this_cpu_add(qstats[qstat_pv_latency_wake],
+                     sched_clock() - *kick_stamp);
+        qstat_inc(qstat_pv_kick_wake, true);
+}
+/*
+ * From this point on, pv_kick()/pv_wait() calls expand to the instrumented
+ * wrappers. The wrappers' own calls appear before these defines and so are
+ * not redirected by them.
+ */
+#define pv_kick(c)      __pv_kick(c)
+#define pv_wait(p, v)   __pv_wait(p, v)
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+        int stolen = pv_queued_spin_steal_lock(lock);
+        /* Only successful steals bump the pv_lock_stealing counter. */
+        if (stolen)
+                this_cpu_inc(qstats[qstat_pv_lock_stealing]);
+        return stolen;
+}
+#undef  queued_spin_trylock
+#define queued_spin_trylock(l)  qstat_spin_steal_lock(l)
+#else /* CONFIG_QUEUED_LOCK_STAT */
+/* Statistics disabled: the counter helpers are empty stubs. */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)  { }
+static inline void qstat_hop(int hopcnt)                        { }
+#endif /* CONFIG_QUEUED_LOCK_STAT */