author     Linus Torvalds <torvalds@linux-foundation.org>   2019-05-06 16:50:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-05-06 16:50:15 -0400
commit     007dc78fea62610bf06829e38f1d8c69b6ea5af6 (patch)
tree       683af90696ed7a237dedd48030bfd649e5822955 /kernel/locking
parent     2f1835dffa949f560dfa3ed63c0bfc10944b461c (diff)
parent     d671002be6bdd7f77a771e23bf3e95d1f16775e6 (diff)
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
 "Here are the locking changes in this cycle:

   - rwsem unification and simpler micro-optimizations to prepare for
     more intrusive (and more lucrative) scalability improvements in
     v5.3 (Waiman Long)

   - Lockdep irq state tracking flag usage cleanups (Frederic Weisbecker)

   - static key improvements (Jakub Kicinski, Peter Zijlstra)

   - misc updates, cleanups and smaller fixes"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
  locking/lockdep: Remove unnecessary unlikely()
  locking/static_key: Don't take sleeping locks in __static_key_slow_dec_deferred()
  locking/static_key: Factor out the fast path of static_key_slow_dec()
  locking/static_key: Add support for deferred static branches
  locking/lockdep: Test all incompatible scenarios at once in check_irq_usage()
  locking/lockdep: Avoid bogus Clang warning
  locking/lockdep: Generate LOCKF_ bit composites
  locking/lockdep: Use expanded masks on find_usage_*() functions
  locking/lockdep: Map remaining magic numbers to lock usage mask names
  locking/lockdep: Move valid_state() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING
  locking/rwsem: Prevent unneeded warning during locking selftest
  locking/rwsem: Optimize rwsem structure for uncontended lock acquisition
  locking/rwsem: Enable lock event counting
  locking/lock_events: Don't show pvqspinlock events on bare metal
  locking/lock_events: Make lock_events available for all archs & other locks
  locking/qspinlock_stat: Introduce generic lockevent_*() counting APIs
  locking/rwsem: Enhance DEBUG_RWSEMS_WARN_ON() macro
  locking/rwsem: Add debug check for __down_read*()
  locking/rwsem: Micro-optimize rwsem_try_read_lock_unqueued()
  locking/rwsem: Move rwsem internal function declarations to rwsem-xadd.h
  ...
Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/Makefile              |    5
-rw-r--r--  kernel/locking/lock_events.c         |  179
-rw-r--r--  kernel/locking/lock_events.h         |   59
-rw-r--r--  kernel/locking/lock_events_list.h    |   67
-rw-r--r--  kernel/locking/lockdep.c             |  267
-rw-r--r--  kernel/locking/lockdep_internals.h   |   34
-rw-r--r--  kernel/locking/percpu-rwsem.c        |    2
-rw-r--r--  kernel/locking/qspinlock.c           |    8
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  |   19
-rw-r--r--  kernel/locking/qspinlock_stat.h      |  242
-rw-r--r--  kernel/locking/rwsem-spinlock.c      |  339
-rw-r--r--  kernel/locking/rwsem-xadd.c          |  204
-rw-r--r--  kernel/locking/rwsem.c               |   25
-rw-r--r--  kernel/locking/rwsem.h               |  174
14 files changed, 858 insertions(+), 766 deletions(-)
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 392c7f23af76..6fe2f333aecb 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
3# and is generally not a function of system call inputs. 3# and is generally not a function of system call inputs.
4KCOV_INSTRUMENT := n 4KCOV_INSTRUMENT := n
5 5
6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o 6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
7 7
8ifdef CONFIG_FUNCTION_TRACER 8ifdef CONFIG_FUNCTION_TRACER
9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
28obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
29obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
30obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o 28obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
31obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 29obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
32obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o 30obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
31obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..fa2c2f951c6b
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <waiman.long@hpe.com>
14 */
15
16/*
17 * Collect locking event counts
18 */
19#include <linux/debugfs.h>
20#include <linux/sched.h>
21#include <linux/sched/clock.h>
22#include <linux/fs.h>
23
24#include "lock_events.h"
25
26#undef LOCK_EVENT
27#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
28
29#define LOCK_EVENTS_DIR "lock_event_counts"
30
31/*
32 * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
33 * types of locks will be reported under the <debugfs>/lock_event_counts/
34 * directory. See lock_events_list.h for the list of available locking
35 * events.
36 *
37 * Writing to the special ".reset_counts" file will reset all the above
38 * locking event counts. This is a very slow operation and so should not
39 * be done frequently.
40 *
41 * These event counts are implemented as per-cpu variables which are
42 * summed and computed whenever the corresponding debugfs files are read. This
43 * minimizes added overhead making the counts usable even in a production
44 * environment.
45 */
46static const char * const lockevent_names[lockevent_num + 1] = {
47
48#include "lock_events_list.h"
49
50 [LOCKEVENT_reset_cnts] = ".reset_counts",
51};
52
53/*
54 * Per-cpu counts
55 */
56DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
57
58/*
59 * The lockevent_read() function can be overridden.
60 */
61ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
62 size_t count, loff_t *ppos)
63{
64 char buf[64];
65 int cpu, id, len;
66 u64 sum = 0;
67
68 /*
69 * Get the counter ID stored in file->f_inode->i_private
70 */
71 id = (long)file_inode(file)->i_private;
72
73 if (id >= lockevent_num)
74 return -EBADF;
75
76 for_each_possible_cpu(cpu)
77 sum += per_cpu(lockevents[id], cpu);
78 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
79
80 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
81}
82
83/*
84 * Function to handle write request
85 *
86 * When idx = reset_cnts, reset all the counts.
87 */
88static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
89 size_t count, loff_t *ppos)
90{
91 int cpu;
92
93 /*
94 * Get the counter ID stored in file->f_inode->i_private
95 */
96 if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts)
97 return count;
98
99 for_each_possible_cpu(cpu) {
100 int i;
101 unsigned long *ptr = per_cpu_ptr(lockevents, cpu);
102
103 for (i = 0 ; i < lockevent_num; i++)
104 WRITE_ONCE(ptr[i], 0);
105 }
106 return count;
107}
108
109/*
110 * Debugfs data structures
111 */
112static const struct file_operations fops_lockevent = {
113 .read = lockevent_read,
114 .write = lockevent_write,
115 .llseek = default_llseek,
116};
117
118#ifdef CONFIG_PARAVIRT_SPINLOCKS
119#include <asm/paravirt.h>
120
121static bool __init skip_lockevent(const char *name)
122{
123 static int pv_on __initdata = -1;
124
125 if (pv_on < 0)
126 pv_on = !pv_is_native_spin_unlock();
127 /*
128 * Skip PV qspinlock events on bare metal.
129 */
130 if (!pv_on && !memcmp(name, "pv_", 3))
131 return true;
132 return false;
133}
134#else
135static inline bool skip_lockevent(const char *name)
136{
137 return false;
138}
139#endif
140
141/*
142 * Initialize debugfs for the locking event counts.
143 */
144static int __init init_lockevent_counts(void)
145{
146 struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
147 int i;
148
149 if (!d_counts)
150 goto out;
151
152 /*
153 * Create the debugfs files
154 *
155 * As reading from and writing to the stat files can be slow, only
156 * root is allowed to do the read/write to limit impact to system
157 * performance.
158 */
159 for (i = 0; i < lockevent_num; i++) {
160 if (skip_lockevent(lockevent_names[i]))
161 continue;
162 if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
163 (void *)(long)i, &fops_lockevent))
164 goto fail_undo;
165 }
166
167 if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
168 d_counts, (void *)(long)LOCKEVENT_reset_cnts,
169 &fops_lockevent))
170 goto fail_undo;
171
172 return 0;
173fail_undo:
174 debugfs_remove_recursive(d_counts);
175out:
176 pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
177 return -ENOMEM;
178}
179fs_initcall(init_lockevent_counts);
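
[Editor's note: the header comment in lock_events.c above describes the debugfs interface the file exposes. As a rough usage illustration, not part of the patch, the standalone userspace sketch below reads one counter and then resets all of them. It assumes debugfs is mounted at /sys/kernel/debug, CONFIG_LOCK_EVENT_COUNTS=y and root privileges; "rwsem_sleep_reader" is just one of the event names from lock_events_list.h.]

/*
 * Hypothetical userspace sketch (not part of the patch): read one lock
 * event counter and reset all counters through the debugfs files created
 * by lock_events.c.  Assumes debugfs at /sys/kernel/debug and root.
 */
#include <stdio.h>

#define LOCK_EVENTS_DIR "/sys/kernel/debug/lock_event_counts"

int main(void)
{
        char path[256], buf[64];
        FILE *f;

        /* Each event is a file whose content is the summed per-cpu count. */
        snprintf(path, sizeof(path), "%s/%s", LOCK_EVENTS_DIR, "rwsem_sleep_reader");
        f = fopen(path, "r");
        if (f && fgets(buf, sizeof(buf), f))
                printf("rwsem_sleep_reader = %s", buf);
        if (f)
                fclose(f);

        /* Writing anything to ".reset_counts" zeroes every counter. */
        snprintf(path, sizeof(path), "%s/%s", LOCK_EVENTS_DIR, ".reset_counts");
        f = fopen(path, "w");
        if (f) {
                fputs("1\n", f);
                fclose(f);
        }
        return 0;
}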
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..feb1acc54611
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,59 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef __LOCKING_LOCK_EVENTS_H
17#define __LOCKING_LOCK_EVENTS_H
18
19enum lock_events {
20
21#include "lock_events_list.h"
22
23 lockevent_num, /* Total number of lock event counts */
24 LOCKEVENT_reset_cnts = lockevent_num,
25};
26
27#ifdef CONFIG_LOCK_EVENT_COUNTS
28/*
29 * Per-cpu counters
30 */
31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
32
33/*
34 * Increment the PV qspinlock statistical counters
35 */
36static inline void __lockevent_inc(enum lock_events event, bool cond)
37{
38 if (cond)
39 __this_cpu_inc(lockevents[event]);
40}
41
42#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
43#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
44
45static inline void __lockevent_add(enum lock_events event, int inc)
46{
47 __this_cpu_add(lockevents[event], inc);
48}
49
50#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
51
52#else /* CONFIG_LOCK_EVENT_COUNTS */
53
54#define lockevent_inc(ev)
55#define lockevent_add(ev, c)
56#define lockevent_cond_inc(ev, c)
57
58#endif /* CONFIG_LOCK_EVENT_COUNTS */
59#endif /* __LOCKING_LOCK_EVENTS_H */
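
[Editor's note: a minimal standalone sketch of what the lockevent_inc()/lockevent_cond_inc() helpers above count. The per-cpu storage is replaced by a plain array and the demo_* names are invented for the illustration; only the conditional-increment behaviour mirrors the kernel macros.]

#include <stdio.h>

enum demo_events { DEMO_lock_pending, DEMO_lock_slowpath, demo_event_num };

static unsigned long demo_counts[demo_event_num];

static inline void __demo_inc(enum demo_events event, int cond)
{
        if (cond)
                demo_counts[event]++;   /* kernel version: __this_cpu_inc() */
}

#define demo_inc(ev)            __demo_inc(DEMO_ ## ev, 1)
#define demo_cond_inc(ev, c)    __demo_inc(DEMO_ ## ev, (c))

int main(void)
{
        demo_inc(lock_pending);                 /* always counted */
        demo_cond_inc(lock_slowpath, 0);        /* condition false: not counted */
        demo_cond_inc(lock_slowpath, 1);        /* condition true: counted */

        printf("lock_pending=%lu lock_slowpath=%lu\n",
               demo_counts[DEMO_lock_pending], demo_counts[DEMO_lock_slowpath]);
        return 0;                               /* prints: lock_pending=1 lock_slowpath=1 */
}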
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..ad7668cfc9da
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,67 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef LOCK_EVENT
17#define LOCK_EVENT(name) LOCKEVENT_ ## name,
18#endif
19
20#ifdef CONFIG_QUEUED_SPINLOCKS
21#ifdef CONFIG_PARAVIRT_SPINLOCKS
22/*
23 * Locking events for PV qspinlock.
24 */
25LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
26LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
27LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
28LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
29LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
30LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
31LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
32LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
33LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
34LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
35LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
36#endif /* CONFIG_PARAVIRT_SPINLOCKS */
37
38/*
39 * Locking events for qspinlock
40 *
41 * Subtracting lock_use_node[234] from lock_slowpath will give you
42 * lock_use_node1.
43 */
44LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
45LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
46LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
47LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
48LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
49LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
50#endif /* CONFIG_QUEUED_SPINLOCKS */
51
52/*
53 * Locking events for rwsem
54 */
55LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
59LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */
60LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */
61LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
62LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
63LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
64LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */
65LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
66LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
67LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
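
[Editor's note: lock_events_list.h is expanded twice with different LOCK_EVENT() definitions — once into enum identifiers (lock_events.h) and once into the debugfs name table (lock_events.c). Below is a self-contained sketch of that x-macro pattern; the event list is inlined as a macro only so the example fits in one file, whereas the kernel re-includes the header.]

#include <stdio.h>

#define DEMO_EVENT_LIST(X)      \
        X(rwsem_rlock)          \
        X(rwsem_wlock)          \
        X(rwsem_wlock_fail)

/* First expansion: enum identifiers, as in lock_events.h. */
#define LOCK_EVENT(name) LOCKEVENT_ ## name,
enum lock_events {
        DEMO_EVENT_LIST(LOCK_EVENT)
        lockevent_num,
};
#undef LOCK_EVENT

/* Second expansion: the debugfs file names, as in lock_events.c. */
#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
static const char * const lockevent_names[lockevent_num] = {
        DEMO_EVENT_LIST(LOCK_EVENT)
};
#undef LOCK_EVENT

int main(void)
{
        int i;

        for (i = 0; i < lockevent_num; i++)
                printf("%d: %s\n", i, lockevent_names[i]);
        return 0;
}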
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 91c6b89f04df..27b992fe8cec 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -501,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
501{ 501{
502 char c = '.'; 502 char c = '.';
503 503
504 if (class->usage_mask & lock_flag(bit + 2)) 504 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
505 c = '+'; 505 c = '+';
506 if (class->usage_mask & lock_flag(bit)) { 506 if (class->usage_mask & lock_flag(bit)) {
507 c = '-'; 507 c = '-';
508 if (class->usage_mask & lock_flag(bit + 2)) 508 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
509 c = '?'; 509 c = '?';
510 } 510 }
511 511
@@ -1666,19 +1666,25 @@ check_redundant(struct lock_list *root, struct lock_class *target,
1666} 1666}
1667 1667
1668#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1668#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1669
1670static inline int usage_accumulate(struct lock_list *entry, void *mask)
1671{
1672 *(unsigned long *)mask |= entry->class->usage_mask;
1673
1674 return 0;
1675}
1676
1669/* 1677/*
1670 * Forwards and backwards subgraph searching, for the purposes of 1678 * Forwards and backwards subgraph searching, for the purposes of
1671 * proving that two subgraphs can be connected by a new dependency 1679 * proving that two subgraphs can be connected by a new dependency
1672 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1680 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1673 */ 1681 */
1674 1682
1675static inline int usage_match(struct lock_list *entry, void *bit) 1683static inline int usage_match(struct lock_list *entry, void *mask)
1676{ 1684{
1677 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); 1685 return entry->class->usage_mask & *(unsigned long *)mask;
1678} 1686}
1679 1687
1680
1681
1682/* 1688/*
1683 * Find a node in the forwards-direction dependency sub-graph starting 1689 * Find a node in the forwards-direction dependency sub-graph starting
1684 * at @root->class that matches @bit. 1690 * at @root->class that matches @bit.
@@ -1690,14 +1696,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)
1690 * Return <0 on error. 1696 * Return <0 on error.
1691 */ 1697 */
1692static int 1698static int
1693find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, 1699find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
1694 struct lock_list **target_entry) 1700 struct lock_list **target_entry)
1695{ 1701{
1696 int result; 1702 int result;
1697 1703
1698 debug_atomic_inc(nr_find_usage_forwards_checks); 1704 debug_atomic_inc(nr_find_usage_forwards_checks);
1699 1705
1700 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1706 result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
1701 1707
1702 return result; 1708 return result;
1703} 1709}
@@ -1713,14 +1719,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1713 * Return <0 on error. 1719 * Return <0 on error.
1714 */ 1720 */
1715static int 1721static int
1716find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, 1722find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
1717 struct lock_list **target_entry) 1723 struct lock_list **target_entry)
1718{ 1724{
1719 int result; 1725 int result;
1720 1726
1721 debug_atomic_inc(nr_find_usage_backwards_checks); 1727 debug_atomic_inc(nr_find_usage_backwards_checks);
1722 1728
1723 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1729 result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
1724 1730
1725 return result; 1731 return result;
1726} 1732}
@@ -1912,39 +1918,6 @@ print_bad_irq_dependency(struct task_struct *curr,
1912 return 0; 1918 return 0;
1913} 1919}
1914 1920
1915static int
1916check_usage(struct task_struct *curr, struct held_lock *prev,
1917 struct held_lock *next, enum lock_usage_bit bit_backwards,
1918 enum lock_usage_bit bit_forwards, const char *irqclass)
1919{
1920 int ret;
1921 struct lock_list this, that;
1922 struct lock_list *uninitialized_var(target_entry);
1923 struct lock_list *uninitialized_var(target_entry1);
1924
1925 this.parent = NULL;
1926
1927 this.class = hlock_class(prev);
1928 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1929 if (ret < 0)
1930 return print_bfs_bug(ret);
1931 if (ret == 1)
1932 return ret;
1933
1934 that.parent = NULL;
1935 that.class = hlock_class(next);
1936 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1937 if (ret < 0)
1938 return print_bfs_bug(ret);
1939 if (ret == 1)
1940 return ret;
1941
1942 return print_bad_irq_dependency(curr, &this, &that,
1943 target_entry, target_entry1,
1944 prev, next,
1945 bit_backwards, bit_forwards, irqclass);
1946}
1947
1948static const char *state_names[] = { 1921static const char *state_names[] = {
1949#define LOCKDEP_STATE(__STATE) \ 1922#define LOCKDEP_STATE(__STATE) \
1950 __stringify(__STATE), 1923 __stringify(__STATE),
@@ -1961,9 +1934,19 @@ static const char *state_rnames[] = {
1961 1934
1962static inline const char *state_name(enum lock_usage_bit bit) 1935static inline const char *state_name(enum lock_usage_bit bit)
1963{ 1936{
1964 return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; 1937 if (bit & LOCK_USAGE_READ_MASK)
1938 return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
1939 else
1940 return state_names[bit >> LOCK_USAGE_DIR_MASK];
1965} 1941}
1966 1942
1943/*
1944 * The bit number is encoded like:
1945 *
1946 * bit0: 0 exclusive, 1 read lock
1947 * bit1: 0 used in irq, 1 irq enabled
1948 * bit2-n: state
1949 */
1967static int exclusive_bit(int new_bit) 1950static int exclusive_bit(int new_bit)
1968{ 1951{
1969 int state = new_bit & LOCK_USAGE_STATE_MASK; 1952 int state = new_bit & LOCK_USAGE_STATE_MASK;
@@ -1975,45 +1958,160 @@ static int exclusive_bit(int new_bit)
1975 return state | (dir ^ LOCK_USAGE_DIR_MASK); 1958 return state | (dir ^ LOCK_USAGE_DIR_MASK);
1976} 1959}
1977 1960
1961/*
1962 * Observe that when given a bitmask where each bitnr is encoded as above, a
1963 * right shift of the mask transforms the individual bitnrs as -1 and
1964 * conversely, a left shift transforms into +1 for the individual bitnrs.
1965 *
1966 * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can
1967 * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0)
1968 * instead by subtracting the bit number by 2, or shifting the mask right by 2.
1969 *
1970 * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2.
1971 *
1972 * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is
1973 * all bits set) and recompose with bitnr1 flipped.
1974 */
1975static unsigned long invert_dir_mask(unsigned long mask)
1976{
1977 unsigned long excl = 0;
1978
1979 /* Invert dir */
1980 excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
1981 excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
1982
1983 return excl;
1984}
1985
1986/*
1987 * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
1988 * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
1989 * And then mask out all bitnr0.
1990 */
1991static unsigned long exclusive_mask(unsigned long mask)
1992{
1993 unsigned long excl = invert_dir_mask(mask);
1994
1995 /* Strip read */
1996 excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
1997 excl &= ~LOCKF_IRQ_READ;
1998
1999 return excl;
2000}
2001
2002/*
2003 * Retrieve the _possible_ original mask to which @mask is
2004 * exclusive. Ie: this is the opposite of exclusive_mask().
2005 * Note that 2 possible original bits can match an exclusive
2006 * bit: one has LOCK_USAGE_READ_MASK set, the other has it
2007 * cleared. So both are returned for each exclusive bit.
2008 */
2009static unsigned long original_mask(unsigned long mask)
2010{
2011 unsigned long excl = invert_dir_mask(mask);
2012
2013 /* Include read in existing usages */
2014 excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
2015
2016 return excl;
2017}
2018
2019/*
2020 * Find the first pair of bit match between an original
2021 * usage mask and an exclusive usage mask.
2022 */
2023static int find_exclusive_match(unsigned long mask,
2024 unsigned long excl_mask,
2025 enum lock_usage_bit *bitp,
2026 enum lock_usage_bit *excl_bitp)
2027{
2028 int bit, excl;
2029
2030 for_each_set_bit(bit, &mask, LOCK_USED) {
2031 excl = exclusive_bit(bit);
2032 if (excl_mask & lock_flag(excl)) {
2033 *bitp = bit;
2034 *excl_bitp = excl;
2035 return 0;
2036 }
2037 }
2038 return -1;
2039}
2040
2041/*
2042 * Prove that the new dependency does not connect a hardirq-safe(-read)
2043 * lock with a hardirq-unsafe lock - to achieve this we search
2044 * the backwards-subgraph starting at <prev>, and the
2045 * forwards-subgraph starting at <next>:
2046 */
1978static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, 2047static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
1979 struct held_lock *next, enum lock_usage_bit bit) 2048 struct held_lock *next)
1980{ 2049{
2050 unsigned long usage_mask = 0, forward_mask, backward_mask;
2051 enum lock_usage_bit forward_bit = 0, backward_bit = 0;
2052 struct lock_list *uninitialized_var(target_entry1);
2053 struct lock_list *uninitialized_var(target_entry);
2054 struct lock_list this, that;
2055 int ret;
2056
1981 /* 2057 /*
1982 * Prove that the new dependency does not connect a hardirq-safe 2058 * Step 1: gather all hard/soft IRQs usages backward in an
1983 * lock with a hardirq-unsafe lock - to achieve this we search 2059 * accumulated usage mask.
1984 * the backwards-subgraph starting at <prev>, and the
1985 * forwards-subgraph starting at <next>:
1986 */ 2060 */
1987 if (!check_usage(curr, prev, next, bit, 2061 this.parent = NULL;
1988 exclusive_bit(bit), state_name(bit))) 2062 this.class = hlock_class(prev);
1989 return 0; 2063
2064 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
2065 if (ret < 0)
2066 return print_bfs_bug(ret);
1990 2067
1991 bit++; /* _READ */ 2068 usage_mask &= LOCKF_USED_IN_IRQ_ALL;
2069 if (!usage_mask)
2070 return 1;
1992 2071
1993 /* 2072 /*
1994 * Prove that the new dependency does not connect a hardirq-safe-read 2073 * Step 2: find exclusive uses forward that match the previous
1995 * lock with a hardirq-unsafe lock - to achieve this we search 2074 * backward accumulated mask.
1996 * the backwards-subgraph starting at <prev>, and the
1997 * forwards-subgraph starting at <next>:
1998 */ 2075 */
1999 if (!check_usage(curr, prev, next, bit, 2076 forward_mask = exclusive_mask(usage_mask);
2000 exclusive_bit(bit), state_name(bit)))
2001 return 0;
2002 2077
2003 return 1; 2078 that.parent = NULL;
2004} 2079 that.class = hlock_class(next);
2005 2080
2006static int 2081 ret = find_usage_forwards(&that, forward_mask, &target_entry1);
2007check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2082 if (ret < 0)
2008 struct held_lock *next) 2083 return print_bfs_bug(ret);
2009{ 2084 if (ret == 1)
2010#define LOCKDEP_STATE(__STATE) \ 2085 return ret;
2011 if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
2012 return 0;
2013#include "lockdep_states.h"
2014#undef LOCKDEP_STATE
2015 2086
2016 return 1; 2087 /*
2088 * Step 3: we found a bad match! Now retrieve a lock from the backward
2089 * list whose usage mask matches the exclusive usage mask from the
2090 * lock found on the forward list.
2091 */
2092 backward_mask = original_mask(target_entry1->class->usage_mask);
2093
2094 ret = find_usage_backwards(&this, backward_mask, &target_entry);
2095 if (ret < 0)
2096 return print_bfs_bug(ret);
2097 if (DEBUG_LOCKS_WARN_ON(ret == 1))
2098 return 1;
2099
2100 /*
2101 * Step 4: narrow down to a pair of incompatible usage bits
2102 * and report it.
2103 */
2104 ret = find_exclusive_match(target_entry->class->usage_mask,
2105 target_entry1->class->usage_mask,
2106 &backward_bit, &forward_bit);
2107 if (DEBUG_LOCKS_WARN_ON(ret == -1))
2108 return 1;
2109
2110 return print_bad_irq_dependency(curr, &this, &that,
2111 target_entry, target_entry1,
2112 prev, next,
2113 backward_bit, forward_bit,
2114 state_name(backward_bit));
2017} 2115}
2018 2116
2019static void inc_chains(void) 2117static void inc_chains(void)
@@ -2030,9 +2128,8 @@ static void inc_chains(void)
2030 2128
2031#else 2129#else
2032 2130
2033static inline int 2131static inline int check_irq_usage(struct task_struct *curr,
2034check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2132 struct held_lock *prev, struct held_lock *next)
2035 struct held_lock *next)
2036{ 2133{
2037 return 1; 2134 return 1;
2038} 2135}
@@ -2211,7 +2308,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2211 else if (unlikely(ret < 0)) 2308 else if (unlikely(ret < 0))
2212 return print_bfs_bug(ret); 2309 return print_bfs_bug(ret);
2213 2310
2214 if (!check_prev_add_irq(curr, prev, next)) 2311 if (!check_irq_usage(curr, prev, next))
2215 return 0; 2312 return 0;
2216 2313
2217 /* 2314 /*
@@ -2773,6 +2870,12 @@ static void check_chain_key(struct task_struct *curr)
2773#endif 2870#endif
2774} 2871}
2775 2872
2873static int mark_lock(struct task_struct *curr, struct held_lock *this,
2874 enum lock_usage_bit new_bit);
2875
2876#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2877
2878
2776static void 2879static void
2777print_usage_bug_scenario(struct held_lock *lock) 2880print_usage_bug_scenario(struct held_lock *lock)
2778{ 2881{
@@ -2842,10 +2945,6 @@ valid_state(struct task_struct *curr, struct held_lock *this,
2842 return 1; 2945 return 1;
2843} 2946}
2844 2947
2845static int mark_lock(struct task_struct *curr, struct held_lock *this,
2846 enum lock_usage_bit new_bit);
2847
2848#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2849 2948
2850/* 2949/*
2851 * print irq inversion bug: 2950 * print irq inversion bug:
@@ -2925,7 +3024,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
2925 3024
2926 root.parent = NULL; 3025 root.parent = NULL;
2927 root.class = hlock_class(this); 3026 root.class = hlock_class(this);
2928 ret = find_usage_forwards(&root, bit, &target_entry); 3027 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
2929 if (ret < 0) 3028 if (ret < 0)
2930 return print_bfs_bug(ret); 3029 return print_bfs_bug(ret);
2931 if (ret == 1) 3030 if (ret == 1)
@@ -2949,7 +3048,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2949 3048
2950 root.parent = NULL; 3049 root.parent = NULL;
2951 root.class = hlock_class(this); 3050 root.class = hlock_class(this);
2952 ret = find_usage_backwards(&root, bit, &target_entry); 3051 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
2953 if (ret < 0) 3052 if (ret < 0)
2954 return print_bfs_bug(ret); 3053 return print_bfs_bug(ret);
2955 if (ret == 1) 3054 if (ret == 1)
@@ -3004,7 +3103,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
3004static inline int state_verbose(enum lock_usage_bit bit, 3103static inline int state_verbose(enum lock_usage_bit bit,
3005 struct lock_class *class) 3104 struct lock_class *class)
3006{ 3105{
3007 return state_verbose_f[bit >> 2](class); 3106 return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
3008} 3107}
3009 3108
3010typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, 3109typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -3146,7 +3245,7 @@ void lockdep_hardirqs_on(unsigned long ip)
3146 /* 3245 /*
3147 * See the fine text that goes along with this variable definition. 3246 * See the fine text that goes along with this variable definition.
3148 */ 3247 */
3149 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 3248 if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
3150 return; 3249 return;
3151 3250
3152 /* 3251 /*
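
[Editor's note: the new comments in the check_irq_usage() rework above describe the usage-bit encoding (bit0 = read, bit1 = direction, higher bits = irq state) and the mask transforms built on it. The standalone sketch below works one example through exclusive_bit() and exclusive_mask(); the constants are reproduced here only for illustration, assuming the two irq states (hardirq, softirq) of the kernel's lockdep_states.h.]

#include <assert.h>
#include <stdio.h>

#define LOCK_USAGE_READ_MASK    1       /* bit0: 0 exclusive, 1 read lock      */
#define LOCK_USAGE_DIR_MASK     2       /* bit1: 0 used-in-irq, 1 irq-enabled  */
#define LOCK_USAGE_STATE_MASK   (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))

enum {                                  /* bit numbers: hardirq, then softirq  */
        LOCK_USED_IN_HARDIRQ,           /* 0 */
        LOCK_USED_IN_HARDIRQ_READ,      /* 1 */
        LOCK_ENABLED_HARDIRQ,           /* 2 */
        LOCK_ENABLED_HARDIRQ_READ,      /* 3 */
        LOCK_USED_IN_SOFTIRQ,           /* 4 */
        LOCK_USED_IN_SOFTIRQ_READ,      /* 5 */
        LOCK_ENABLED_SOFTIRQ,           /* 6 */
        LOCK_ENABLED_SOFTIRQ_READ,      /* 7 */
};

#define LOCKF(bit)              (1UL << (bit))
#define LOCKF_USED_IN_IRQ_ALL   (LOCKF(0) | LOCKF(1) | LOCKF(4) | LOCKF(5))
#define LOCKF_ENABLED_IRQ_ALL   (LOCKF(2) | LOCKF(3) | LOCKF(6) | LOCKF(7))
#define LOCKF_IRQ_READ          (LOCKF(1) | LOCKF(3) | LOCKF(5) | LOCKF(7))

/* Same logic as lockdep's exclusive_bit(): flip the direction, drop read. */
static int exclusive_bit(int new_bit)
{
        int state = new_bit & LOCK_USAGE_STATE_MASK;
        int dir = new_bit & LOCK_USAGE_DIR_MASK;

        return state | (dir ^ LOCK_USAGE_DIR_MASK);
}

/* Shifting by 2 moves every bit number between USED_IN and ENABLED. */
static unsigned long invert_dir_mask(unsigned long mask)
{
        unsigned long excl = 0;

        excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
        excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
        return excl;
}

/* Additionally fold the read bits onto their non-read counterparts. */
static unsigned long exclusive_mask(unsigned long mask)
{
        unsigned long excl = invert_dir_mask(mask);

        excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
        excl &= ~LOCKF_IRQ_READ;
        return excl;
}

int main(void)
{
        /* A lock taken for reading in hardirq context ... */
        unsigned long usage = LOCKF(LOCK_USED_IN_HARDIRQ_READ);

        /* ... is incompatible with a lock held with hardirqs enabled. */
        assert(exclusive_bit(LOCK_USED_IN_HARDIRQ_READ) == LOCK_ENABLED_HARDIRQ);
        assert(exclusive_mask(usage) == LOCKF(LOCK_ENABLED_HARDIRQ));

        printf("exclusive_mask(0x%lx) = 0x%lx\n", usage, exclusive_mask(usage));
        return 0;
}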
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index d4c197425f68..150ec3f0c5b5 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -42,13 +42,35 @@ enum {
42 __LOCKF(USED) 42 __LOCKF(USED)
43}; 43};
44 44
45#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) 45#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
46#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) 46static const unsigned long LOCKF_ENABLED_IRQ =
47#include "lockdep_states.h"
48 0;
49#undef LOCKDEP_STATE
50
51#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
52static const unsigned long LOCKF_USED_IN_IRQ =
53#include "lockdep_states.h"
54 0;
55#undef LOCKDEP_STATE
56
57#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
58static const unsigned long LOCKF_ENABLED_IRQ_READ =
59#include "lockdep_states.h"
60 0;
61#undef LOCKDEP_STATE
62
63#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
64static const unsigned long LOCKF_USED_IN_IRQ_READ =
65#include "lockdep_states.h"
66 0;
67#undef LOCKDEP_STATE
68
69#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
70#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
47 71
48#define LOCKF_ENABLED_IRQ_READ \ 72#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
49 (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) 73#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
50#define LOCKF_USED_IN_IRQ_READ \
51 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
52 74
53/* 75/*
54 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, 76 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
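
[Editor's note: the LOCKF_* composites above are generated by expanding LOCKDEP_STATE() into a chain of "mask |" terms terminated by 0. A compact standalone sketch of the same trick follows; the state list is inlined as a macro and the two mask values are illustrative stand-ins for the real enum bit positions.]

#include <stdio.h>

#define LOCKF_ENABLED_HARDIRQ   0x04UL
#define LOCKF_ENABLED_SOFTIRQ   0x40UL

#define DEMO_STATES(X)  X(HARDIRQ) X(SOFTIRQ)

#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
static const unsigned long LOCKF_ENABLED_IRQ =
        DEMO_STATES(LOCKDEP_STATE)
        0;
#undef LOCKDEP_STATE

int main(void)
{
        printf("LOCKF_ENABLED_IRQ = 0x%lx\n", LOCKF_ENABLED_IRQ);       /* 0x44 */
        return 0;
}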
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 883cf1b92d90..f17dad99eec8 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,8 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9 9
10#include "rwsem.h"
11
10int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, 12int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
11 const char *name, struct lock_class_key *rwsem_key) 13 const char *name, struct lock_class_key *rwsem_key)
12{ 14{
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5e9247dc2515..e14b32c69639 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
395 * 0,1,0 -> 0,0,1 395 * 0,1,0 -> 0,0,1
396 */ 396 */
397 clear_pending_set_locked(lock); 397 clear_pending_set_locked(lock);
398 qstat_inc(qstat_lock_pending, true); 398 lockevent_inc(lock_pending);
399 return; 399 return;
400 400
401 /* 401 /*
@@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
403 * queuing. 403 * queuing.
404 */ 404 */
405queue: 405queue:
406 qstat_inc(qstat_lock_slowpath, true); 406 lockevent_inc(lock_slowpath);
407pv_queue: 407pv_queue:
408 node = this_cpu_ptr(&qnodes[0].mcs); 408 node = this_cpu_ptr(&qnodes[0].mcs);
409 idx = node->count++; 409 idx = node->count++;
@@ -419,7 +419,7 @@ pv_queue:
419 * simple enough. 419 * simple enough.
420 */ 420 */
421 if (unlikely(idx >= MAX_NODES)) { 421 if (unlikely(idx >= MAX_NODES)) {
422 qstat_inc(qstat_lock_no_node, true); 422 lockevent_inc(lock_no_node);
423 while (!queued_spin_trylock(lock)) 423 while (!queued_spin_trylock(lock))
424 cpu_relax(); 424 cpu_relax();
425 goto release; 425 goto release;
@@ -430,7 +430,7 @@ pv_queue:
430 /* 430 /*
431 * Keep counts of non-zero index values: 431 * Keep counts of non-zero index values:
432 */ 432 */
433 qstat_inc(qstat_lock_use_node2 + idx - 1, idx); 433 lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
434 434
435 /* 435 /*
436 * Ensure that we increment the head node->count before initialising 436 * Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8f36c27c1794..89bab079e7a4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
89 89
90 if (!(val & _Q_LOCKED_PENDING_MASK) && 90 if (!(val & _Q_LOCKED_PENDING_MASK) &&
91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { 91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
92 qstat_inc(qstat_pv_lock_stealing, true); 92 lockevent_inc(pv_lock_stealing);
93 return true; 93 return true;
94 } 94 }
95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) 95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
@@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
219 hopcnt++; 219 hopcnt++;
220 if (!cmpxchg(&he->lock, NULL, lock)) { 220 if (!cmpxchg(&he->lock, NULL, lock)) {
221 WRITE_ONCE(he->node, node); 221 WRITE_ONCE(he->node, node);
222 qstat_hop(hopcnt); 222 lockevent_pv_hop(hopcnt);
223 return &he->lock; 223 return &he->lock;
224 } 224 }
225 } 225 }
@@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
320 smp_store_mb(pn->state, vcpu_halted); 320 smp_store_mb(pn->state, vcpu_halted);
321 321
322 if (!READ_ONCE(node->locked)) { 322 if (!READ_ONCE(node->locked)) {
323 qstat_inc(qstat_pv_wait_node, true); 323 lockevent_inc(pv_wait_node);
324 qstat_inc(qstat_pv_wait_early, wait_early); 324 lockevent_cond_inc(pv_wait_early, wait_early);
325 pv_wait(&pn->state, vcpu_halted); 325 pv_wait(&pn->state, vcpu_halted);
326 } 326 }
327 327
@@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
339 * So it is better to spin for a while in the hope that the 339 * So it is better to spin for a while in the hope that the
340 * MCS lock will be released soon. 340 * MCS lock will be released soon.
341 */ 341 */
342 qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); 342 lockevent_cond_inc(pv_spurious_wakeup,
343 !READ_ONCE(node->locked));
343 } 344 }
344 345
345 /* 346 /*
@@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
416 /* 417 /*
417 * Tracking # of slowpath locking operations 418 * Tracking # of slowpath locking operations
418 */ 419 */
419 qstat_inc(qstat_lock_slowpath, true); 420 lockevent_inc(lock_slowpath);
420 421
421 for (;; waitcnt++) { 422 for (;; waitcnt++) {
422 /* 423 /*
@@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
464 } 465 }
465 } 466 }
466 WRITE_ONCE(pn->state, vcpu_hashed); 467 WRITE_ONCE(pn->state, vcpu_hashed);
467 qstat_inc(qstat_pv_wait_head, true); 468 lockevent_inc(pv_wait_head);
468 qstat_inc(qstat_pv_wait_again, waitcnt); 469 lockevent_cond_inc(pv_wait_again, waitcnt);
469 pv_wait(&lock->locked, _Q_SLOW_VAL); 470 pv_wait(&lock->locked, _Q_SLOW_VAL);
470 471
471 /* 472 /*
@@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
528 * vCPU is harmless other than the additional latency in completing 529 * vCPU is harmless other than the additional latency in completing
529 * the unlock. 530 * the unlock.
530 */ 531 */
531 qstat_inc(qstat_pv_kick_unlock, true); 532 lockevent_inc(pv_kick_unlock);
532 pv_kick(node->cpu); 533 pv_kick(node->cpu);
533} 534}
534 535
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index d73f85388d5c..54152670ff24 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -9,262 +9,105 @@
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details. 10 * GNU General Public License for more details.
11 * 11 *
12 * Authors: Waiman Long <waiman.long@hpe.com> 12 * Authors: Waiman Long <longman@redhat.com>
13 */ 13 */
14 14
15/* 15#include "lock_events.h"
16 * When queued spinlock statistical counters are enabled, the following
17 * debugfs files will be created for reporting the counter values:
18 *
19 * <debugfs>/qlockstat/
20 * pv_hash_hops - average # of hops per hashing operation
21 * pv_kick_unlock - # of vCPU kicks issued at unlock time
22 * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
23 * pv_latency_kick - average latency (ns) of vCPU kick operation
24 * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
25 * pv_lock_stealing - # of lock stealing operations
26 * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
27 * pv_wait_again - # of wait's after a queue head vCPU kick
28 * pv_wait_early - # of early vCPU wait's
29 * pv_wait_head - # of vCPU wait's at the queue head
30 * pv_wait_node - # of vCPU wait's at a non-head queue node
31 * lock_pending - # of locking operations via pending code
32 * lock_slowpath - # of locking operations via MCS lock queue
33 * lock_use_node2 - # of locking operations that use 2nd per-CPU node
34 * lock_use_node3 - # of locking operations that use 3rd per-CPU node
35 * lock_use_node4 - # of locking operations that use 4th per-CPU node
36 * lock_no_node - # of locking operations without using per-CPU node
37 *
38 * Subtracting lock_use_node[234] from lock_slowpath will give you
39 * lock_use_node1.
40 *
41 * Writing to the "reset_counters" file will reset all the above counter
42 * values.
43 *
44 * These statistical counters are implemented as per-cpu variables which are
45 * summed and computed whenever the corresponding debugfs files are read. This
46 * minimizes added overhead making the counters usable even in a production
47 * environment.
48 *
49 * There may be slight difference between pv_kick_wake and pv_kick_unlock.
50 */
51enum qlock_stats {
52 qstat_pv_hash_hops,
53 qstat_pv_kick_unlock,
54 qstat_pv_kick_wake,
55 qstat_pv_latency_kick,
56 qstat_pv_latency_wake,
57 qstat_pv_lock_stealing,
58 qstat_pv_spurious_wakeup,
59 qstat_pv_wait_again,
60 qstat_pv_wait_early,
61 qstat_pv_wait_head,
62 qstat_pv_wait_node,
63 qstat_lock_pending,
64 qstat_lock_slowpath,
65 qstat_lock_use_node2,
66 qstat_lock_use_node3,
67 qstat_lock_use_node4,
68 qstat_lock_no_node,
69 qstat_num, /* Total number of statistical counters */
70 qstat_reset_cnts = qstat_num,
71};
72 16
73#ifdef CONFIG_QUEUED_LOCK_STAT 17#ifdef CONFIG_LOCK_EVENT_COUNTS
18#ifdef CONFIG_PARAVIRT_SPINLOCKS
74/* 19/*
75 * Collect pvqspinlock statistics 20 * Collect pvqspinlock locking event counts
76 */ 21 */
77#include <linux/debugfs.h>
78#include <linux/sched.h> 22#include <linux/sched.h>
79#include <linux/sched/clock.h> 23#include <linux/sched/clock.h>
80#include <linux/fs.h> 24#include <linux/fs.h>
81 25
82static const char * const qstat_names[qstat_num + 1] = { 26#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
83 [qstat_pv_hash_hops] = "pv_hash_hops",
84 [qstat_pv_kick_unlock] = "pv_kick_unlock",
85 [qstat_pv_kick_wake] = "pv_kick_wake",
86 [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
87 [qstat_pv_latency_kick] = "pv_latency_kick",
88 [qstat_pv_latency_wake] = "pv_latency_wake",
89 [qstat_pv_lock_stealing] = "pv_lock_stealing",
90 [qstat_pv_wait_again] = "pv_wait_again",
91 [qstat_pv_wait_early] = "pv_wait_early",
92 [qstat_pv_wait_head] = "pv_wait_head",
93 [qstat_pv_wait_node] = "pv_wait_node",
94 [qstat_lock_pending] = "lock_pending",
95 [qstat_lock_slowpath] = "lock_slowpath",
96 [qstat_lock_use_node2] = "lock_use_node2",
97 [qstat_lock_use_node3] = "lock_use_node3",
98 [qstat_lock_use_node4] = "lock_use_node4",
99 [qstat_lock_no_node] = "lock_no_node",
100 [qstat_reset_cnts] = "reset_counters",
101};
102 27
103/* 28/*
104 * Per-cpu counters 29 * PV specific per-cpu counter
105 */ 30 */
106static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
107static DEFINE_PER_CPU(u64, pv_kick_time); 31static DEFINE_PER_CPU(u64, pv_kick_time);
108 32
109/* 33/*
110 * Function to read and return the qlock statistical counter values 34 * Function to read and return the PV qspinlock counts.
111 * 35 *
112 * The following counters are handled specially: 36 * The following counters are handled specially:
113 * 1. qstat_pv_latency_kick 37 * 1. pv_latency_kick
114 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock 38 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
115 * 2. qstat_pv_latency_wake 39 * 2. pv_latency_wake
116 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake 40 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
117 * 3. qstat_pv_hash_hops 41 * 3. pv_hash_hops
118 * Average hops/hash = pv_hash_hops/pv_kick_unlock 42 * Average hops/hash = pv_hash_hops/pv_kick_unlock
119 */ 43 */
120static ssize_t qstat_read(struct file *file, char __user *user_buf, 44ssize_t lockevent_read(struct file *file, char __user *user_buf,
121 size_t count, loff_t *ppos) 45 size_t count, loff_t *ppos)
122{ 46{
123 char buf[64]; 47 char buf[64];
124 int cpu, counter, len; 48 int cpu, id, len;
125 u64 stat = 0, kicks = 0; 49 u64 sum = 0, kicks = 0;
126 50
127 /* 51 /*
128 * Get the counter ID stored in file->f_inode->i_private 52 * Get the counter ID stored in file->f_inode->i_private
129 */ 53 */
130 counter = (long)file_inode(file)->i_private; 54 id = (long)file_inode(file)->i_private;
131 55
132 if (counter >= qstat_num) 56 if (id >= lockevent_num)
133 return -EBADF; 57 return -EBADF;
134 58
135 for_each_possible_cpu(cpu) { 59 for_each_possible_cpu(cpu) {
136 stat += per_cpu(qstats[counter], cpu); 60 sum += per_cpu(lockevents[id], cpu);
137 /* 61 /*
138 * Need to sum additional counter for some of them 62 * Need to sum additional counters for some of them
139 */ 63 */
140 switch (counter) { 64 switch (id) {
141 65
142 case qstat_pv_latency_kick: 66 case LOCKEVENT_pv_latency_kick:
143 case qstat_pv_hash_hops: 67 case LOCKEVENT_pv_hash_hops:
144 kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); 68 kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
145 break; 69 break;
146 70
147 case qstat_pv_latency_wake: 71 case LOCKEVENT_pv_latency_wake:
148 kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); 72 kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
149 break; 73 break;
150 } 74 }
151 } 75 }
152 76
153 if (counter == qstat_pv_hash_hops) { 77 if (id == LOCKEVENT_pv_hash_hops) {
154 u64 frac = 0; 78 u64 frac = 0;
155 79
156 if (kicks) { 80 if (kicks) {
157 frac = 100ULL * do_div(stat, kicks); 81 frac = 100ULL * do_div(sum, kicks);
158 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); 82 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
159 } 83 }
160 84
161 /* 85 /*
162 * Return a X.XX decimal number 86 * Return a X.XX decimal number
163 */ 87 */
164 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); 88 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
89 sum, frac);
165 } else { 90 } else {
166 /* 91 /*
167 * Round to the nearest ns 92 * Round to the nearest ns
168 */ 93 */
169 if ((counter == qstat_pv_latency_kick) || 94 if ((id == LOCKEVENT_pv_latency_kick) ||
170 (counter == qstat_pv_latency_wake)) { 95 (id == LOCKEVENT_pv_latency_wake)) {
171 if (kicks) 96 if (kicks)
172 stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); 97 sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
173 } 98 }
174 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); 99 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
175 } 100 }
176 101
177 return simple_read_from_buffer(user_buf, count, ppos, buf, len); 102 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
178} 103}
179 104
180/* 105/*
181 * Function to handle write request
182 *
183 * When counter = reset_cnts, reset all the counter values.
184 * Since the counter updates aren't atomic, the resetting is done twice
185 * to make sure that the counters are very likely to be all cleared.
186 */
187static ssize_t qstat_write(struct file *file, const char __user *user_buf,
188 size_t count, loff_t *ppos)
189{
190 int cpu;
191
192 /*
193 * Get the counter ID stored in file->f_inode->i_private
194 */
195 if ((long)file_inode(file)->i_private != qstat_reset_cnts)
196 return count;
197
198 for_each_possible_cpu(cpu) {
199 int i;
200 unsigned long *ptr = per_cpu_ptr(qstats, cpu);
201
202 for (i = 0 ; i < qstat_num; i++)
203 WRITE_ONCE(ptr[i], 0);
204 }
205 return count;
206}
207
208/*
209 * Debugfs data structures
210 */
211static const struct file_operations fops_qstat = {
212 .read = qstat_read,
213 .write = qstat_write,
214 .llseek = default_llseek,
215};
216
217/*
218 * Initialize debugfs for the qspinlock statistical counters
219 */
220static int __init init_qspinlock_stat(void)
221{
222 struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
223 int i;
224
225 if (!d_qstat)
226 goto out;
227
228 /*
229 * Create the debugfs files
230 *
231 * As reading from and writing to the stat files can be slow, only
232 * root is allowed to do the read/write to limit impact to system
233 * performance.
234 */
235 for (i = 0; i < qstat_num; i++)
236 if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
237 (void *)(long)i, &fops_qstat))
238 goto fail_undo;
239
240 if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
241 (void *)(long)qstat_reset_cnts, &fops_qstat))
242 goto fail_undo;
243
244 return 0;
245fail_undo:
246 debugfs_remove_recursive(d_qstat);
247out:
248 pr_warn("Could not create 'qlockstat' debugfs entries\n");
249 return -ENOMEM;
250}
251fs_initcall(init_qspinlock_stat);
252
253/*
254 * Increment the PV qspinlock statistical counters
255 */
256static inline void qstat_inc(enum qlock_stats stat, bool cond)
257{
258 if (cond)
259 this_cpu_inc(qstats[stat]);
260}
261
262/*
263 * PV hash hop count 106 * PV hash hop count
264 */ 107 */
265static inline void qstat_hop(int hopcnt) 108static inline void lockevent_pv_hop(int hopcnt)
266{ 109{
267 this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); 110 this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
268} 111}
269 112
270/* 113/*
@@ -276,7 +119,7 @@ static inline void __pv_kick(int cpu)
276 119
277 per_cpu(pv_kick_time, cpu) = start; 120 per_cpu(pv_kick_time, cpu) = start;
278 pv_kick(cpu); 121 pv_kick(cpu);
279 this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); 122 this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
280} 123}
281 124
282/* 125/*
@@ -289,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
289 *pkick_time = 0; 132 *pkick_time = 0;
290 pv_wait(ptr, val); 133 pv_wait(ptr, val);
291 if (*pkick_time) { 134 if (*pkick_time) {
292 this_cpu_add(qstats[qstat_pv_latency_wake], 135 this_cpu_add(EVENT_COUNT(pv_latency_wake),
293 sched_clock() - *pkick_time); 136 sched_clock() - *pkick_time);
294 qstat_inc(qstat_pv_kick_wake, true); 137 lockevent_inc(pv_kick_wake);
295 } 138 }
296} 139}
297 140
298#define pv_kick(c) __pv_kick(c) 141#define pv_kick(c) __pv_kick(c)
299#define pv_wait(p, v) __pv_wait(p, v) 142#define pv_wait(p, v) __pv_wait(p, v)
300 143
301#else /* CONFIG_QUEUED_LOCK_STAT */ 144#endif /* CONFIG_PARAVIRT_SPINLOCKS */
145
146#else /* CONFIG_LOCK_EVENT_COUNTS */
302 147
303static inline void qstat_inc(enum qlock_stats stat, bool cond) { } 148static inline void lockevent_pv_hop(int hopcnt) { }
304static inline void qstat_hop(int hopcnt) { }
305 149
306#endif /* CONFIG_QUEUED_LOCK_STAT */ 150#endif /* CONFIG_LOCK_EVENT_COUNTS */
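
[Editor's note: the PV override of lockevent_read() above reports pv_hash_hops as an "X.XX" average — total hops divided by pv_kick_unlock — without using floating point. A standalone sketch of that fixed-point computation, with made-up sample numbers:]

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
        uint64_t sum = 523;     /* e.g. accumulated pv_hash_hops */
        uint64_t kicks = 250;   /* e.g. pv_kick_unlock count     */
        uint64_t whole, frac;

        whole = sum / kicks;                    /* integer part                  */
        frac  = 100 * (sum % kicks);            /* kernel: do_div() remainder    */
        frac  = (frac + kicks / 2) / kicks;     /* DIV_ROUND_CLOSEST_ULL()       */

        printf("pv_hash_hops = %" PRIu64 ".%02" PRIu64 "\n", whole, frac);
        /* prints: pv_hash_hops = 2.09 */
        return 0;
}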
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index a7ffb2a96ede..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,339 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
3 * generic spinlock implementation
4 *
5 * Copyright (c) 2001 David Howells (dhowells@redhat.com).
6 * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
7 * - Derived also from comments by Linus
8 */
9#include <linux/rwsem.h>
10#include <linux/sched/signal.h>
11#include <linux/sched/debug.h>
12#include <linux/export.h>
13
14enum rwsem_waiter_type {
15 RWSEM_WAITING_FOR_WRITE,
16 RWSEM_WAITING_FOR_READ
17};
18
19struct rwsem_waiter {
20 struct list_head list;
21 struct task_struct *task;
22 enum rwsem_waiter_type type;
23};
24
25int rwsem_is_locked(struct rw_semaphore *sem)
26{
27 int ret = 1;
28 unsigned long flags;
29
30 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
31 ret = (sem->count != 0);
32 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
33 }
34 return ret;
35}
36EXPORT_SYMBOL(rwsem_is_locked);
37
38/*
39 * initialise the semaphore
40 */
41void __init_rwsem(struct rw_semaphore *sem, const char *name,
42 struct lock_class_key *key)
43{
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45 /*
46 * Make sure we are not reinitializing a held semaphore:
47 */
48 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
49 lockdep_init_map(&sem->dep_map, name, key, 0);
50#endif
51 sem->count = 0;
52 raw_spin_lock_init(&sem->wait_lock);
53 INIT_LIST_HEAD(&sem->wait_list);
54}
55EXPORT_SYMBOL(__init_rwsem);
56
57/*
58 * handle the lock release when processes blocked on it that can now run
59 * - if we come here, then:
60 * - the 'active count' _reached_ zero
61 * - the 'waiting count' is non-zero
62 * - the spinlock must be held by the caller
63 * - woken process blocks are discarded from the list after having task zeroed
64 * - writers are only woken if wakewrite is non-zero
65 */
66static inline struct rw_semaphore *
67__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
68{
69 struct rwsem_waiter *waiter;
70 struct task_struct *tsk;
71 int woken;
72
73 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
74
75 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
76 if (wakewrite)
77 /* Wake up a writer. Note that we do not grant it the
78 * lock - it will have to acquire it when it runs. */
79 wake_up_process(waiter->task);
80 goto out;
81 }
82
83 /* grant an infinite number of read locks to the front of the queue */
84 woken = 0;
85 do {
86 struct list_head *next = waiter->list.next;
87
88 list_del(&waiter->list);
89 tsk = waiter->task;
90 /*
91 * Make sure we do not wakeup the next reader before
92 * setting the nil condition to grant the next reader;
93 * otherwise we could miss the wakeup on the other
94 * side and end up sleeping again. See the pairing
95 * in rwsem_down_read_failed().
96 */
97 smp_mb();
98 waiter->task = NULL;
99 wake_up_process(tsk);
100 put_task_struct(tsk);
101 woken++;
102 if (next == &sem->wait_list)
103 break;
104 waiter = list_entry(next, struct rwsem_waiter, list);
105 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
106
107 sem->count += woken;
108
109 out:
110 return sem;
111}
112
113/*
114 * wake a single writer
115 */
116static inline struct rw_semaphore *
117__rwsem_wake_one_writer(struct rw_semaphore *sem)
118{
119 struct rwsem_waiter *waiter;
120
121 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
122 wake_up_process(waiter->task);
123
124 return sem;
125}
126
127/*
128 * get a read lock on the semaphore
129 */
130int __sched __down_read_common(struct rw_semaphore *sem, int state)
131{
132 struct rwsem_waiter waiter;
133 unsigned long flags;
134
135 raw_spin_lock_irqsave(&sem->wait_lock, flags);
136
137 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
138 /* granted */
139 sem->count++;
140 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
141 goto out;
142 }
143
144 /* set up my own style of waitqueue */
145 waiter.task = current;
146 waiter.type = RWSEM_WAITING_FOR_READ;
147 get_task_struct(current);
148
149 list_add_tail(&waiter.list, &sem->wait_list);
150
151 /* wait to be given the lock */
152 for (;;) {
153 if (!waiter.task)
154 break;
155 if (signal_pending_state(state, current))
156 goto out_nolock;
157 set_current_state(state);
158 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
159 schedule();
160 raw_spin_lock_irqsave(&sem->wait_lock, flags);
161 }
162
163 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
164 out:
165 return 0;
166
167out_nolock:
168 /*
169 * We didn't take the lock, so that there is a writer, which
170 * is owner or the first waiter of the sem. If it's a waiter,
171 * it will be woken by current owner. Not need to wake anybody.
172 */
173 list_del(&waiter.list);
174 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
175 return -EINTR;
176}
177
178void __sched __down_read(struct rw_semaphore *sem)
179{
180 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
181}
182
183int __sched __down_read_killable(struct rw_semaphore *sem)
184{
185 return __down_read_common(sem, TASK_KILLABLE);
186}
187
188/*
189 * trylock for reading -- returns 1 if successful, 0 if contention
190 */
191int __down_read_trylock(struct rw_semaphore *sem)
192{
193 unsigned long flags;
194 int ret = 0;
195
196
197 raw_spin_lock_irqsave(&sem->wait_lock, flags);
198
199 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
200 /* granted */
201 sem->count++;
202 ret = 1;
203 }
204
205 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
206
207 return ret;
208}
209
210/*
211 * get a write lock on the semaphore
212 */
213int __sched __down_write_common(struct rw_semaphore *sem, int state)
214{
215 struct rwsem_waiter waiter;
216 unsigned long flags;
217 int ret = 0;
218
219 raw_spin_lock_irqsave(&sem->wait_lock, flags);
220
221 /* set up my own style of waitqueue */
222 waiter.task = current;
223 waiter.type = RWSEM_WAITING_FOR_WRITE;
224 list_add_tail(&waiter.list, &sem->wait_list);
225
226 /* wait for someone to release the lock */
227 for (;;) {
228 /*
229 * That is the key to support write lock stealing: allows the
230 * task already on CPU to get the lock soon rather than put
231 * itself into sleep and waiting for system woke it or someone
232 * else in the head of the wait list up.
233 */
234 if (sem->count == 0)
235 break;
236 if (signal_pending_state(state, current))
237 goto out_nolock;
238
239 set_current_state(state);
240 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
241 schedule();
242 raw_spin_lock_irqsave(&sem->wait_lock, flags);
243 }
244 /* got the lock */
245 sem->count = -1;
246 list_del(&waiter.list);
247
248 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
249
250 return ret;
251
252out_nolock:
253 list_del(&waiter.list);
254 if (!list_empty(&sem->wait_list) && sem->count >= 0)
255 __rwsem_do_wake(sem, 0);
256 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
257
258 return -EINTR;
259}
260
261void __sched __down_write(struct rw_semaphore *sem)
262{
263 __down_write_common(sem, TASK_UNINTERRUPTIBLE);
264}
265
266int __sched __down_write_killable(struct rw_semaphore *sem)
267{
268 return __down_write_common(sem, TASK_KILLABLE);
269}
270
271/*
272 * trylock for writing -- returns 1 if successful, 0 if contention
273 */
274int __down_write_trylock(struct rw_semaphore *sem)
275{
276 unsigned long flags;
277 int ret = 0;
278
279 raw_spin_lock_irqsave(&sem->wait_lock, flags);
280
281 if (sem->count == 0) {
282 /* got the lock */
283 sem->count = -1;
284 ret = 1;
285 }
286
287 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
288
289 return ret;
290}
291
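Write lock stealing in this implementation works because __up_write() never hands the lock to a sleeper directly: every writer, whether it has just arrived or has just been woken, re-checks count == 0 under wait_lock and claims the lock by setting count to -1. A task already running on a CPU can therefore win against queued sleepers. A minimal userspace sketch of that rule (hypothetical names, not kernel code):

/*
 * Userspace sketch of write-lock stealing in the code above: a writer
 * takes the lock by observing count == 0 under wait_lock and then
 * setting count = -1.  Names are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_rwsem {
	long count;		/* 0: free, -1: write-locked, >0: readers */
};

/* What both a freshly arriving writer and a woken waiter do. */
static bool toy_write_steal(struct toy_rwsem *sem)
{
	if (sem->count != 0)
		return false;	/* still held; go (back) to sleep */
	sem->count = -1;	/* got the lock */
	return true;
}

int main(void)
{
	struct toy_rwsem sem = { .count = 0 };

	/* A task already on a CPU wins the race before sleepers run. */
	printf("fresh writer steals:  %d\n", toy_write_steal(&sem));	/* 1 */
	printf("woken waiter retries: %d\n", toy_write_steal(&sem));	/* 0 */
	return 0;
}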
292/*
293 * release a read lock on the semaphore
294 */
295void __up_read(struct rw_semaphore *sem)
296{
297 unsigned long flags;
298
299 raw_spin_lock_irqsave(&sem->wait_lock, flags);
300
301 if (--sem->count == 0 && !list_empty(&sem->wait_list))
302 sem = __rwsem_wake_one_writer(sem);
303
304 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
305}
306
307/*
308 * release a write lock on the semaphore
309 */
310void __up_write(struct rw_semaphore *sem)
311{
312 unsigned long flags;
313
314 raw_spin_lock_irqsave(&sem->wait_lock, flags);
315
316 sem->count = 0;
317 if (!list_empty(&sem->wait_list))
318 sem = __rwsem_do_wake(sem, 1);
319
320 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
321}
322
323/*
324 * downgrade a write lock into a read lock
325 * - just wake up any readers at the front of the queue
326 */
327void __downgrade_write(struct rw_semaphore *sem)
328{
329 unsigned long flags;
330
331 raw_spin_lock_irqsave(&sem->wait_lock, flags);
332
333 sem->count = 1;
334 if (!list_empty(&sem->wait_list))
335 sem = __rwsem_do_wake(sem, 0);
336
337 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
338}
339
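The release side of this implementation follows a simple wake policy: __up_read() wakes a single waiter only when the last reader leaves, while __up_write() and __downgrade_write() call __rwsem_do_wake(), which either wakes the writer at the head of the queue or grants the lock to the leading run of readers, bumping sem->count by the number woken (as seen at the top of this file). A compact userspace model of that decision, using an array in place of the wait list (all names made up):

/*
 * Sketch of the wake policy used by __rwsem_do_wake() above, modelled
 * on a plain array instead of the kernel's wait list.  Illustrative.
 */
#include <stdio.h>

enum toy_wait { TOY_WAIT_READ, TOY_WAIT_WRITE };

struct toy_rwsem {
	long		count;
	enum toy_wait	queue[8];	/* head of the wait list first */
	int		nwaiters;
};

/* Returns how many waiters would be woken on release (wakewrite case). */
static int toy_do_wake(struct toy_rwsem *sem)
{
	int woken = 0;

	if (sem->nwaiters == 0)
		return 0;
	if (sem->queue[0] == TOY_WAIT_WRITE)
		return 1;			/* wake one writer */
	while (woken < sem->nwaiters && sem->queue[woken] == TOY_WAIT_READ)
		woken++;			/* grant the leading readers */
	sem->count += woken;
	return woken;
}

int main(void)
{
	struct toy_rwsem sem = {
		.count	  = 0,
		.queue	  = { TOY_WAIT_READ, TOY_WAIT_READ, TOY_WAIT_WRITE },
		.nwaiters = 3,
	};

	printf("woken on up_write: %d\n", toy_do_wake(&sem));	/* 2 readers */
	printf("reader count now:  %ld\n", sem.count);		/* 2 */
	return 0;
}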
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index fbe96341beee..6b3ee9948bf1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
147 * will notice the queued writer. 147 * will notice the queued writer.
148 */ 148 */
149 wake_q_add(wake_q, waiter->task); 149 wake_q_add(wake_q, waiter->task);
150 lockevent_inc(rwsem_wake_writer);
150 } 151 }
151 152
152 return; 153 return;
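The lockevent_inc()/lockevent_cond_inc() calls added in this and the following hunks come from the new lock_events infrastructure introduced elsewhere in this series (kernel/locking/lock_events.[ch] in the diffstat). Roughly, when CONFIG_LOCK_EVENT_COUNTS is enabled they bump per-CPU event counters whose totals are reported through debugfs; with it disabled they compile away. Ignoring the per-CPU machinery, the counting amounts to something like the following userspace approximation (event names are taken from this diff, everything else is simplified and made up):

/*
 * Rough userspace approximation of the lock event counters used above.
 * The real code keeps one counter per CPU per event and sums them when
 * read; this model collapses that to a single global array.
 */
#include <stdio.h>

enum toy_lock_event {
	toy_rwsem_wake_writer,
	toy_rwsem_wake_reader,
	toy_rwsem_sleep_reader,
	toy_lockevent_num,
};

static unsigned long toy_lockevents[toy_lockevent_num];

#define toy_lockevent_inc(ev)	(toy_lockevents[toy_##ev]++)

/* Increments by one when the condition is non-zero, as in the kernel. */
#define toy_lockevent_cond_inc(ev, c)		\
	do {					\
		if (c)				\
			toy_lockevent_inc(ev);	\
	} while (0)

int main(void)
{
	toy_lockevent_inc(rwsem_wake_writer);
	toy_lockevent_cond_inc(rwsem_wake_reader, 3 /* woken > 0 */);

	printf("rwsem_wake_writer = %lu\n", toy_lockevents[toy_rwsem_wake_writer]);
	printf("rwsem_wake_reader = %lu\n", toy_lockevents[toy_rwsem_wake_reader]);
	return 0;
}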
@@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
176 goto try_reader_grant; 177 goto try_reader_grant;
177 } 178 }
178 /* 179 /*
179 * It is not really necessary to set it to reader-owned here, 180 * Set it to reader-owned to give spinners an early
180 * but it gives the spinners an early indication that the 181 * indication that readers now have the lock.
181 * readers now have the lock.
182 */ 182 */
183 __rwsem_set_reader_owned(sem, waiter->task); 183 __rwsem_set_reader_owned(sem, waiter->task);
184 } 184 }
@@ -215,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
215 } 215 }
216 216
217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; 217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
218 lockevent_cond_inc(rwsem_wake_reader, woken);
218 if (list_empty(&sem->wait_list)) { 219 if (list_empty(&sem->wait_list)) {
219 /* hit end of list above */ 220 /* hit end of list above */
220 adjustment -= RWSEM_WAITING_BIAS; 221 adjustment -= RWSEM_WAITING_BIAS;
@@ -225,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
225} 226}
226 227
227/* 228/*
228 * Wait for the read lock to be granted
229 */
230static inline struct rw_semaphore __sched *
231__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
232{
233 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
234 struct rwsem_waiter waiter;
235 DEFINE_WAKE_Q(wake_q);
236
237 waiter.task = current;
238 waiter.type = RWSEM_WAITING_FOR_READ;
239
240 raw_spin_lock_irq(&sem->wait_lock);
241 if (list_empty(&sem->wait_list)) {
242 /*
243 * In case the wait queue is empty and the lock isn't owned
244 * by a writer, this reader can exit the slowpath and return
245 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
246 * been set in the count.
247 */
248 if (atomic_long_read(&sem->count) >= 0) {
249 raw_spin_unlock_irq(&sem->wait_lock);
250 return sem;
251 }
252 adjustment += RWSEM_WAITING_BIAS;
253 }
254 list_add_tail(&waiter.list, &sem->wait_list);
255
256 /* we're now waiting on the lock, but no longer actively locking */
257 count = atomic_long_add_return(adjustment, &sem->count);
258
259 /*
260 * If there are no active locks, wake the front queued process(es).
261 *
262 * If there are no writers and we are first in the queue,
263 * wake our own waiter to join the existing active readers !
264 */
265 if (count == RWSEM_WAITING_BIAS ||
266 (count > RWSEM_WAITING_BIAS &&
267 adjustment != -RWSEM_ACTIVE_READ_BIAS))
268 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
269
270 raw_spin_unlock_irq(&sem->wait_lock);
271 wake_up_q(&wake_q);
272
273 /* wait to be given the lock */
274 while (true) {
275 set_current_state(state);
276 if (!waiter.task)
277 break;
278 if (signal_pending_state(state, current)) {
279 raw_spin_lock_irq(&sem->wait_lock);
280 if (waiter.task)
281 goto out_nolock;
282 raw_spin_unlock_irq(&sem->wait_lock);
283 break;
284 }
285 schedule();
286 }
287
288 __set_current_state(TASK_RUNNING);
289 return sem;
290out_nolock:
291 list_del(&waiter.list);
292 if (list_empty(&sem->wait_list))
293 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
294 raw_spin_unlock_irq(&sem->wait_lock);
295 __set_current_state(TASK_RUNNING);
296 return ERR_PTR(-EINTR);
297}
298
299__visible struct rw_semaphore * __sched
300rwsem_down_read_failed(struct rw_semaphore *sem)
301{
302 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
303}
304EXPORT_SYMBOL(rwsem_down_read_failed);
305
306__visible struct rw_semaphore * __sched
307rwsem_down_read_failed_killable(struct rw_semaphore *sem)
308{
309 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
310}
311EXPORT_SYMBOL(rwsem_down_read_failed_killable);
312
313/*
314 * This function must be called with the sem->wait_lock held to prevent 229 * This function must be called with the sem->wait_lock held to prevent
315 * race conditions between checking the rwsem wait list and setting the 230 * race conditions between checking the rwsem wait list and setting the
316 * sem->count accordingly. 231 * sem->count accordingly.
@@ -346,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
346 */ 261 */
347static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 262static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
348{ 263{
349 long old, count = atomic_long_read(&sem->count); 264 long count = atomic_long_read(&sem->count);
350
351 while (true) {
352 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
353 return false;
354 265
355 old = atomic_long_cmpxchg_acquire(&sem->count, count, 266 while (!count || count == RWSEM_WAITING_BIAS) {
356 count + RWSEM_ACTIVE_WRITE_BIAS); 267 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
357 if (old == count) { 268 count + RWSEM_ACTIVE_WRITE_BIAS)) {
358 rwsem_set_owner(sem); 269 rwsem_set_owner(sem);
270 lockevent_inc(rwsem_opt_wlock);
359 return true; 271 return true;
360 } 272 }
361
362 count = old;
363 } 273 }
274 return false;
364} 275}
365 276
366static inline bool owner_on_cpu(struct task_struct *owner) 277static inline bool owner_on_cpu(struct task_struct *owner)
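The rewritten rwsem_try_write_lock_unqueued() above replaces an open-coded cmpxchg retry loop with atomic_long_try_cmpxchg_acquire(), whose failure path writes the value it observed back into count, so the loop condition can be re-tested directly without a separate reload and compare. C11's atomic_compare_exchange_strong_explicit() has the same contract, so the pattern can be illustrated stand-alone as below (constants borrowed from this diff, names otherwise invented):

/*
 * Minimal C11 illustration of the try_cmpxchg pattern adopted above:
 * on failure, 'count' is refreshed with the current value of the
 * atomic, so the while condition re-tests it without an extra load.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_WAITING_BIAS	(-0xffffffffL - 1)
#define TOY_WRITE_BIAS		(TOY_WAITING_BIAS + 1)

static bool toy_try_write_lock_unqueued(atomic_long *countp)
{
	long count = atomic_load(countp);

	while (!count || count == TOY_WAITING_BIAS) {
		if (atomic_compare_exchange_strong_explicit(countp, &count,
				count + TOY_WRITE_BIAS,
				memory_order_acquire, memory_order_relaxed))
			return true;	/* stole the write lock */
	}
	return false;			/* an active locker appeared */
}

int main(void)
{
	atomic_long count = 0;		/* unlocked, no waiters */

	printf("first attempt:  %d\n", toy_try_write_lock_unqueued(&count));	/* 1 */
	printf("second attempt: %d\n", toy_try_write_lock_unqueued(&count));	/* 0 */
	return 0;
}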
@@ -481,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
481 osq_unlock(&sem->osq); 392 osq_unlock(&sem->osq);
482done: 393done:
483 preempt_enable(); 394 preempt_enable();
395 lockevent_cond_inc(rwsem_opt_fail, !taken);
484 return taken; 396 return taken;
485} 397}
486 398
@@ -505,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
505#endif 417#endif
506 418
507/* 419/*
420 * Wait for the read lock to be granted
421 */
422static inline struct rw_semaphore __sched *
423__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
424{
425 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
426 struct rwsem_waiter waiter;
427 DEFINE_WAKE_Q(wake_q);
428
429 waiter.task = current;
430 waiter.type = RWSEM_WAITING_FOR_READ;
431
432 raw_spin_lock_irq(&sem->wait_lock);
433 if (list_empty(&sem->wait_list)) {
434 /*
435 * In case the wait queue is empty and the lock isn't owned
436 * by a writer, this reader can exit the slowpath and return
437 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
438 * been set in the count.
439 */
440 if (atomic_long_read(&sem->count) >= 0) {
441 raw_spin_unlock_irq(&sem->wait_lock);
442 rwsem_set_reader_owned(sem);
443 lockevent_inc(rwsem_rlock_fast);
444 return sem;
445 }
446 adjustment += RWSEM_WAITING_BIAS;
447 }
448 list_add_tail(&waiter.list, &sem->wait_list);
449
450 /* we're now waiting on the lock, but no longer actively locking */
451 count = atomic_long_add_return(adjustment, &sem->count);
452
453 /*
454 * If there are no active locks, wake the front queued process(es).
455 *
456 * If there are no writers and we are first in the queue,
457 * wake our own waiter to join the existing active readers !
458 */
459 if (count == RWSEM_WAITING_BIAS ||
460 (count > RWSEM_WAITING_BIAS &&
461 adjustment != -RWSEM_ACTIVE_READ_BIAS))
462 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
463
464 raw_spin_unlock_irq(&sem->wait_lock);
465 wake_up_q(&wake_q);
466
467 /* wait to be given the lock */
468 while (true) {
469 set_current_state(state);
470 if (!waiter.task)
471 break;
472 if (signal_pending_state(state, current)) {
473 raw_spin_lock_irq(&sem->wait_lock);
474 if (waiter.task)
475 goto out_nolock;
476 raw_spin_unlock_irq(&sem->wait_lock);
477 break;
478 }
479 schedule();
480 lockevent_inc(rwsem_sleep_reader);
481 }
482
483 __set_current_state(TASK_RUNNING);
484 lockevent_inc(rwsem_rlock);
485 return sem;
486out_nolock:
487 list_del(&waiter.list);
488 if (list_empty(&sem->wait_list))
489 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
490 raw_spin_unlock_irq(&sem->wait_lock);
491 __set_current_state(TASK_RUNNING);
492 lockevent_inc(rwsem_rlock_fail);
493 return ERR_PTR(-EINTR);
494}
495
496__visible struct rw_semaphore * __sched
497rwsem_down_read_failed(struct rw_semaphore *sem)
498{
499 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
500}
501EXPORT_SYMBOL(rwsem_down_read_failed);
502
503__visible struct rw_semaphore * __sched
504rwsem_down_read_failed_killable(struct rw_semaphore *sem)
505{
506 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
507}
508EXPORT_SYMBOL(rwsem_down_read_failed_killable);
509
510/*
508 * Wait until we successfully acquire the write lock 511 * Wait until we successfully acquire the write lock
509 */ 512 */
510static inline struct rw_semaphore * 513static inline struct rw_semaphore *
@@ -580,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
580 goto out_nolock; 583 goto out_nolock;
581 584
582 schedule(); 585 schedule();
586 lockevent_inc(rwsem_sleep_writer);
583 set_current_state(state); 587 set_current_state(state);
584 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); 588 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
585 589
@@ -588,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
588 __set_current_state(TASK_RUNNING); 592 __set_current_state(TASK_RUNNING);
589 list_del(&waiter.list); 593 list_del(&waiter.list);
590 raw_spin_unlock_irq(&sem->wait_lock); 594 raw_spin_unlock_irq(&sem->wait_lock);
595 lockevent_inc(rwsem_wlock);
591 596
592 return ret; 597 return ret;
593 598
@@ -601,6 +606,7 @@ out_nolock:
601 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 606 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
602 raw_spin_unlock_irq(&sem->wait_lock); 607 raw_spin_unlock_irq(&sem->wait_lock);
603 wake_up_q(&wake_q); 608 wake_up_q(&wake_q);
609 lockevent_inc(rwsem_wlock_fail);
604 610
605 return ERR_PTR(-EINTR); 611 return ERR_PTR(-EINTR);
606} 612}
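The bias arithmetic in the reader slowpath above is easier to follow with concrete values. The reader's fast path has already added RWSEM_ACTIVE_READ_BIAS before it falls in here; the slowpath then undoes that bias and, if it is the first waiter, adds RWSEM_WAITING_BIAS. Once the writer releases, the count collapses to exactly RWSEM_WAITING_BIAS, i.e. "waiters exist but nobody holds the lock", which is when the front of the queue gets woken. The following stand-alone program walks through that sequence using the 64-bit constants this series moves into rwsem.h further down in the diff; it is illustrative only, not kernel code:

/*
 * Worked example of the rwsem-xadd count arithmetic in the reader
 * slowpath above, with the 64-bit bias constants from rwsem.h.
 */
#include <stdio.h>

#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	/* A writer holds the lock; a reader's fast path already added its bias. */
	long count = RWSEM_ACTIVE_WRITE_BIAS + RWSEM_ACTIVE_READ_BIAS;

	/* First waiter: undo the read bias, add the waiting bias. */
	long adjustment = -RWSEM_ACTIVE_READ_BIAS + RWSEM_WAITING_BIAS;

	count += adjustment;
	printf("active lockers while the reader sleeps: %ld\n",
	       count & RWSEM_ACTIVE_MASK);			/* 1 (the writer) */

	/* The writer releases: it subtracts its write bias. */
	count -= RWSEM_ACTIVE_WRITE_BIAS;
	printf("wake the front of the queue? %d\n",
	       count == RWSEM_WAITING_BIAS);			/* 1 */
	return 0;
}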
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e586f0d03ad3..ccbf18f560ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem)
24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
25 25
26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
27 rwsem_set_reader_owned(sem);
28} 27}
29 28
30EXPORT_SYMBOL(down_read); 29EXPORT_SYMBOL(down_read);
@@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
39 return -EINTR; 38 return -EINTR;
40 } 39 }
41 40
42 rwsem_set_reader_owned(sem);
43 return 0; 41 return 0;
44} 42}
45 43
@@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem)
52{ 50{
53 int ret = __down_read_trylock(sem); 51 int ret = __down_read_trylock(sem);
54 52
55 if (ret == 1) { 53 if (ret == 1)
56 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 54 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
57 rwsem_set_reader_owned(sem);
58 }
59 return ret; 55 return ret;
60} 56}
61 57
@@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem)
70 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 66 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
71 67
72 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 68 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
73 rwsem_set_owner(sem);
74} 69}
75 70
76EXPORT_SYMBOL(down_write); 71EXPORT_SYMBOL(down_write);
@@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem)
88 return -EINTR; 83 return -EINTR;
89 } 84 }
90 85
91 rwsem_set_owner(sem);
92 return 0; 86 return 0;
93} 87}
94 88
@@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem)
101{ 95{
102 int ret = __down_write_trylock(sem); 96 int ret = __down_write_trylock(sem);
103 97
104 if (ret == 1) { 98 if (ret == 1)
105 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 99 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
106 rwsem_set_owner(sem);
107 }
108 100
109 return ret; 101 return ret;
110} 102}
@@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock);
117void up_read(struct rw_semaphore *sem) 109void up_read(struct rw_semaphore *sem)
118{ 110{
119 rwsem_release(&sem->dep_map, 1, _RET_IP_); 111 rwsem_release(&sem->dep_map, 1, _RET_IP_);
120 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
121 112
122 rwsem_clear_reader_owned(sem);
123 __up_read(sem); 113 __up_read(sem);
124} 114}
125 115
@@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read);
131void up_write(struct rw_semaphore *sem) 121void up_write(struct rw_semaphore *sem)
132{ 122{
133 rwsem_release(&sem->dep_map, 1, _RET_IP_); 123 rwsem_release(&sem->dep_map, 1, _RET_IP_);
134 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
135 124
136 rwsem_clear_owner(sem);
137 __up_write(sem); 125 __up_write(sem);
138} 126}
139 127
@@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write);
145void downgrade_write(struct rw_semaphore *sem) 133void downgrade_write(struct rw_semaphore *sem)
146{ 134{
147 lock_downgrade(&sem->dep_map, _RET_IP_); 135 lock_downgrade(&sem->dep_map, _RET_IP_);
148 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
149 136
150 rwsem_set_reader_owned(sem);
151 __downgrade_write(sem); 137 __downgrade_write(sem);
152} 138}
153 139
@@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
161 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 147 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
162 148
163 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 149 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
164 rwsem_set_reader_owned(sem);
165} 150}
166 151
167EXPORT_SYMBOL(down_read_nested); 152EXPORT_SYMBOL(down_read_nested);
@@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
172 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 157 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
173 158
174 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 159 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
175 rwsem_set_owner(sem);
176} 160}
177 161
178EXPORT_SYMBOL(_down_write_nest_lock); 162EXPORT_SYMBOL(_down_write_nest_lock);
@@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
193 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 177 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
194 178
195 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 179 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
196 rwsem_set_owner(sem);
197} 180}
198 181
199EXPORT_SYMBOL(down_write_nested); 182EXPORT_SYMBOL(down_write_nested);
@@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
208 return -EINTR; 191 return -EINTR;
209 } 192 }
210 193
211 rwsem_set_owner(sem);
212 return 0; 194 return 0;
213} 195}
214 196
@@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested);
216 198
217void up_read_non_owner(struct rw_semaphore *sem) 199void up_read_non_owner(struct rw_semaphore *sem)
218{ 200{
219 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); 201 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
202 sem);
220 __up_read(sem); 203 __up_read(sem);
221} 204}
222 205
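The removals in rwsem.c above are not lost functionality: the owner-tracking calls (rwsem_set_owner(), rwsem_set_reader_owned(), the clearing counterparts and the DEBUG_RWSEMS_WARN_ON() checks) move into the inline __down_*/__up_*/__downgrade_write() primitives shown in the rwsem.h hunk below, so every acquisition and release path records ownership itself rather than relying on the top-level wrappers. A toy userspace model of that relocation (all names invented):

/*
 * Sketch of the owner-tracking relocation in this patch: the high-level
 * API no longer records the owner because the low-level primitive does
 * it on every path.  Illustrative userspace model, not kernel code.
 */
#include <stdio.h>

struct toy_task  { const char *name; };
struct toy_rwsem { long count; struct toy_task *owner; };

static struct toy_task toy_current = { "taskA" };

/* Stands in for __down_write(): takes the lock and sets the owner. */
static void toy_down_write(struct toy_rwsem *sem)
{
	sem->count = -1;
	sem->owner = &toy_current;	/* previously done by the caller */
}

/* Stands in for down_write(): only lockdep-style bookkeeping remains. */
static void toy_api_down_write(struct toy_rwsem *sem)
{
	/* rwsem_acquire(...) would go here */
	toy_down_write(sem);
	/* no separate rwsem_set_owner() call any more */
}

int main(void)
{
	struct toy_rwsem sem = { 0, NULL };

	toy_api_down_write(&sem);
	printf("owner after down_write: %s\n", sem.owner->name);
	return 0;
}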
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index bad2bca0268b..64877f5294e3 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -23,15 +23,44 @@
23 * is involved. Ideally we would like to track all the readers that own 23 * is involved. Ideally we would like to track all the readers that own
24 * a rwsem, but the overhead is simply too big. 24 * a rwsem, but the overhead is simply too big.
25 */ 25 */
26#include "lock_events.h"
27
26#define RWSEM_READER_OWNED (1UL << 0) 28#define RWSEM_READER_OWNED (1UL << 0)
27#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) 29#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
28 30
29#ifdef CONFIG_DEBUG_RWSEMS 31#ifdef CONFIG_DEBUG_RWSEMS
30# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) 32# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
33 if (!debug_locks_silent && \
34 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
35 #c, atomic_long_read(&(sem)->count), \
36 (long)((sem)->owner), (long)current, \
37 list_empty(&(sem)->wait_list) ? "" : "not ")) \
38 debug_locks_off(); \
39 } while (0)
40#else
41# define DEBUG_RWSEMS_WARN_ON(c, sem)
42#endif
43
44/*
45 * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
46 * Adapted largely from include/asm-i386/rwsem.h
47 * by Paul Mackerras <paulus@samba.org>.
48 */
49
50/*
51 * the semaphore definition
52 */
53#ifdef CONFIG_64BIT
54# define RWSEM_ACTIVE_MASK 0xffffffffL
31#else 55#else
32# define DEBUG_RWSEMS_WARN_ON(c) 56# define RWSEM_ACTIVE_MASK 0x0000ffffL
33#endif 57#endif
34 58
59#define RWSEM_ACTIVE_BIAS 0x00000001L
60#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
61#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
62#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
63
35#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 64#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
36/* 65/*
37 * All writes to owner are protected by WRITE_ONCE() to make sure that 66 * All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
132{ 161{
133} 162}
134#endif 163#endif
164
165extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
166extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
167extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
168extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
169extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
170extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
171
172/*
173 * lock for reading
174 */
175static inline void __down_read(struct rw_semaphore *sem)
176{
177 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
178 rwsem_down_read_failed(sem);
179 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
180 RWSEM_READER_OWNED), sem);
181 } else {
182 rwsem_set_reader_owned(sem);
183 }
184}
185
186static inline int __down_read_killable(struct rw_semaphore *sem)
187{
188 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
189 if (IS_ERR(rwsem_down_read_failed_killable(sem)))
190 return -EINTR;
191 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
192 RWSEM_READER_OWNED), sem);
193 } else {
194 rwsem_set_reader_owned(sem);
195 }
196 return 0;
197}
198
199static inline int __down_read_trylock(struct rw_semaphore *sem)
200{
201 /*
202 * Optimize for the case when the rwsem is not locked at all.
203 */
204 long tmp = RWSEM_UNLOCKED_VALUE;
205
206 lockevent_inc(rwsem_rtrylock);
207 do {
208 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
209 tmp + RWSEM_ACTIVE_READ_BIAS)) {
210 rwsem_set_reader_owned(sem);
211 return 1;
212 }
213 } while (tmp >= 0);
214 return 0;
215}
216
217/*
218 * lock for writing
219 */
220static inline void __down_write(struct rw_semaphore *sem)
221{
222 long tmp;
223
224 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
225 &sem->count);
226 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
227 rwsem_down_write_failed(sem);
228 rwsem_set_owner(sem);
229}
230
231static inline int __down_write_killable(struct rw_semaphore *sem)
232{
233 long tmp;
234
235 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
236 &sem->count);
237 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
238 if (IS_ERR(rwsem_down_write_failed_killable(sem)))
239 return -EINTR;
240 rwsem_set_owner(sem);
241 return 0;
242}
243
244static inline int __down_write_trylock(struct rw_semaphore *sem)
245{
246 long tmp;
247
248 lockevent_inc(rwsem_wtrylock);
249 tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
250 RWSEM_ACTIVE_WRITE_BIAS);
251 if (tmp == RWSEM_UNLOCKED_VALUE) {
252 rwsem_set_owner(sem);
253 return true;
254 }
255 return false;
256}
257
258/*
259 * unlock after reading
260 */
261static inline void __up_read(struct rw_semaphore *sem)
262{
263 long tmp;
264
265 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
266 sem);
267 rwsem_clear_reader_owned(sem);
268 tmp = atomic_long_dec_return_release(&sem->count);
269 if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
270 rwsem_wake(sem);
271}
272
273/*
274 * unlock after writing
275 */
276static inline void __up_write(struct rw_semaphore *sem)
277{
278 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
279 rwsem_clear_owner(sem);
280 if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
281 &sem->count) < 0))
282 rwsem_wake(sem);
283}
284
285/*
286 * downgrade write lock to read lock
287 */
288static inline void __downgrade_write(struct rw_semaphore *sem)
289{
290 long tmp;
291
292 /*
293 * When downgrading from exclusive to shared ownership,
294 * anything inside the write-locked region cannot leak
295 * into the read side. In contrast, anything in the
296 * read-locked region is ok to be re-ordered into the
297 * write side. As such, rely on RELEASE semantics.
298 */
299 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
300 tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
301 rwsem_set_reader_owned(sem);
302 if (tmp < 0)
303 rwsem_downgrade_wake(sem);
304}
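To close, the __downgrade_write() arithmetic above is worth spelling out with the constants defined earlier in this header: adding -RWSEM_WAITING_BIAS turns the writer's bias into a single reader bias, and the result stays negative only when other tasks have contributed RWSEM_WAITING_BIAS to the count, i.e. when there are queued waiters to hand over to rwsem_downgrade_wake(). A stand-alone sketch of both cases (illustrative only, not kernel code):

/*
 * Worked example of the __downgrade_write() arithmetic above, using
 * the 64-bit bias constants from this header.
 */
#include <stdio.h>

#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

static long toy_downgrade(long count)
{
	/* atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count) */
	return count + -RWSEM_WAITING_BIAS;
}

int main(void)
{
	/* Writer only: downgrading leaves exactly one active reader. */
	long tmp = toy_downgrade(RWSEM_ACTIVE_WRITE_BIAS);
	printf("no waiters:   tmp = %ld, wake? %d\n", tmp, tmp < 0);	/* 1, 0 */

	/* Writer plus queued waiters (they contributed RWSEM_WAITING_BIAS). */
	tmp = toy_downgrade(RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
	printf("with waiters: active readers = %ld, wake? %d\n",
	       tmp & RWSEM_ACTIVE_MASK, tmp < 0);			/* 1, 1 */
	return 0;
}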