author     Linus Torvalds <torvalds@linux-foundation.org>   2019-05-06 16:50:15 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-05-06 16:50:15 -0400
commit     007dc78fea62610bf06829e38f1d8c69b6ea5af6 (patch)
tree       683af90696ed7a237dedd48030bfd649e5822955 /kernel/locking
parent     2f1835dffa949f560dfa3ed63c0bfc10944b461c (diff)
parent     d671002be6bdd7f77a771e23bf3e95d1f16775e6 (diff)
Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar:
 "Here are the locking changes in this cycle:

   - rwsem unification and simpler micro-optimizations to prepare for
     more intrusive (and more lucrative) scalability improvements in
     v5.3 (Waiman Long)

   - Lockdep irq state tracking flag usage cleanups (Frederic Weisbecker)

   - static key improvements (Jakub Kicinski, Peter Zijlstra)

   - misc updates, cleanups and smaller fixes"

* 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (26 commits)
  locking/lockdep: Remove unnecessary unlikely()
  locking/static_key: Don't take sleeping locks in __static_key_slow_dec_deferred()
  locking/static_key: Factor out the fast path of static_key_slow_dec()
  locking/static_key: Add support for deferred static branches
  locking/lockdep: Test all incompatible scenarios at once in check_irq_usage()
  locking/lockdep: Avoid bogus Clang warning
  locking/lockdep: Generate LOCKF_ bit composites
  locking/lockdep: Use expanded masks on find_usage_*() functions
  locking/lockdep: Map remaining magic numbers to lock usage mask names
  locking/lockdep: Move valid_state() inside CONFIG_TRACE_IRQFLAGS && CONFIG_PROVE_LOCKING
  locking/rwsem: Prevent unneeded warning during locking selftest
  locking/rwsem: Optimize rwsem structure for uncontended lock acquisition
  locking/rwsem: Enable lock event counting
  locking/lock_events: Don't show pvqspinlock events on bare metal
  locking/lock_events: Make lock_events available for all archs & other locks
  locking/qspinlock_stat: Introduce generic lockevent_*() counting APIs
  locking/rwsem: Enhance DEBUG_RWSEMS_WARN_ON() macro
  locking/rwsem: Add debug check for __down_read*()
  locking/rwsem: Micro-optimize rwsem_try_read_lock_unqueued()
  locking/rwsem: Move rwsem internal function declarations to rwsem-xadd.h
  ...
Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/Makefile              |    5
-rw-r--r--  kernel/locking/lock_events.c         |  179
-rw-r--r--  kernel/locking/lock_events.h         |   59
-rw-r--r--  kernel/locking/lock_events_list.h    |   67
-rw-r--r--  kernel/locking/lockdep.c             |  267
-rw-r--r--  kernel/locking/lockdep_internals.h   |   34
-rw-r--r--  kernel/locking/percpu-rwsem.c        |    2
-rw-r--r--  kernel/locking/qspinlock.c           |    8
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  |   19
-rw-r--r--  kernel/locking/qspinlock_stat.h      |  242
-rw-r--r--  kernel/locking/rwsem-spinlock.c      |  339
-rw-r--r--  kernel/locking/rwsem-xadd.c          |  204
-rw-r--r--  kernel/locking/rwsem.c               |   25
-rw-r--r--  kernel/locking/rwsem.h               |  174
14 files changed, 858 insertions(+), 766 deletions(-)
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 392c7f23af76..6fe2f333aecb 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -3,7 +3,7 @@
3# and is generally not a function of system call inputs. 3# and is generally not a function of system call inputs.
4KCOV_INSTRUMENT := n 4KCOV_INSTRUMENT := n
5 5
6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o 6obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o
7 7
8ifdef CONFIG_FUNCTION_TRACER 8ifdef CONFIG_FUNCTION_TRACER
9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 9CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -25,8 +25,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 26obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 27obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
28obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
29obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
30obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o 28obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
31obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 29obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
32obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o 30obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
31obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
diff --git a/kernel/locking/lock_events.c b/kernel/locking/lock_events.c
new file mode 100644
index 000000000000..fa2c2f951c6b
--- /dev/null
+++ b/kernel/locking/lock_events.c
@@ -0,0 +1,179 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <waiman.long@hpe.com>
14 */
15
16/*
17 * Collect locking event counts
18 */
19#include <linux/debugfs.h>
20#include <linux/sched.h>
21#include <linux/sched/clock.h>
22#include <linux/fs.h>
23
24#include "lock_events.h"
25
26#undef LOCK_EVENT
27#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
28
29#define LOCK_EVENTS_DIR "lock_event_counts"
30
31/*
32 * When CONFIG_LOCK_EVENT_COUNTS is enabled, event counts of different
33 * types of locks will be reported under the <debugfs>/lock_event_counts/
34 * directory. See lock_events_list.h for the list of available locking
35 * events.
36 *
37 * Writing to the special ".reset_counts" file will reset all the above
38 * locking event counts. This is a very slow operation and so should not
39 * be done frequently.
40 *
41 * These event counts are implemented as per-cpu variables which are
42 * summed and computed whenever the corresponding debugfs files are read. This
43 * minimizes added overhead making the counts usable even in a production
44 * environment.
45 */
46static const char * const lockevent_names[lockevent_num + 1] = {
47
48#include "lock_events_list.h"
49
50 [LOCKEVENT_reset_cnts] = ".reset_counts",
51};
52
53/*
54 * Per-cpu counts
55 */
56DEFINE_PER_CPU(unsigned long, lockevents[lockevent_num]);
57
58/*
59 * The lockevent_read() function can be overridden.
60 */
61ssize_t __weak lockevent_read(struct file *file, char __user *user_buf,
62 size_t count, loff_t *ppos)
63{
64 char buf[64];
65 int cpu, id, len;
66 u64 sum = 0;
67
68 /*
69 * Get the counter ID stored in file->f_inode->i_private
70 */
71 id = (long)file_inode(file)->i_private;
72
73 if (id >= lockevent_num)
74 return -EBADF;
75
76 for_each_possible_cpu(cpu)
77 sum += per_cpu(lockevents[id], cpu);
78 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
79
80 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
81}
82
83/*
84 * Function to handle write request
85 *
86 * When idx = reset_cnts, reset all the counts.
87 */
88static ssize_t lockevent_write(struct file *file, const char __user *user_buf,
89 size_t count, loff_t *ppos)
90{
91 int cpu;
92
93 /*
94 * Get the counter ID stored in file->f_inode->i_private
95 */
96 if ((long)file_inode(file)->i_private != LOCKEVENT_reset_cnts)
97 return count;
98
99 for_each_possible_cpu(cpu) {
100 int i;
101 unsigned long *ptr = per_cpu_ptr(lockevents, cpu);
102
103 for (i = 0 ; i < lockevent_num; i++)
104 WRITE_ONCE(ptr[i], 0);
105 }
106 return count;
107}
108
109/*
110 * Debugfs data structures
111 */
112static const struct file_operations fops_lockevent = {
113 .read = lockevent_read,
114 .write = lockevent_write,
115 .llseek = default_llseek,
116};
117
118#ifdef CONFIG_PARAVIRT_SPINLOCKS
119#include <asm/paravirt.h>
120
121static bool __init skip_lockevent(const char *name)
122{
123 static int pv_on __initdata = -1;
124
125 if (pv_on < 0)
126 pv_on = !pv_is_native_spin_unlock();
127 /*
128 * Skip PV qspinlock events on bare metal.
129 */
130 if (!pv_on && !memcmp(name, "pv_", 3))
131 return true;
132 return false;
133}
134#else
135static inline bool skip_lockevent(const char *name)
136{
137 return false;
138}
139#endif
140
141/*
142 * Initialize debugfs for the locking event counts.
143 */
144static int __init init_lockevent_counts(void)
145{
146 struct dentry *d_counts = debugfs_create_dir(LOCK_EVENTS_DIR, NULL);
147 int i;
148
149 if (!d_counts)
150 goto out;
151
152 /*
153 * Create the debugfs files
154 *
155 * As reading from and writing to the stat files can be slow, only
156 * root is allowed to do the read/write to limit impact to system
157 * performance.
158 */
159 for (i = 0; i < lockevent_num; i++) {
160 if (skip_lockevent(lockevent_names[i]))
161 continue;
162 if (!debugfs_create_file(lockevent_names[i], 0400, d_counts,
163 (void *)(long)i, &fops_lockevent))
164 goto fail_undo;
165 }
166
167 if (!debugfs_create_file(lockevent_names[LOCKEVENT_reset_cnts], 0200,
168 d_counts, (void *)(long)LOCKEVENT_reset_cnts,
169 &fops_lockevent))
170 goto fail_undo;
171
172 return 0;
173fail_undo:
174 debugfs_remove_recursive(d_counts);
175out:
176 pr_warn("Could not create '%s' debugfs entries\n", LOCK_EVENTS_DIR);
177 return -ENOMEM;
178}
179fs_initcall(init_lockevent_counts);
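
[Editor's note: the header comment in lock_events.c above describes the debugfs interface the file exposes. As a rough usage illustration, not part of the patch, the standalone userspace sketch below reads one counter and then resets all of them. It assumes debugfs is mounted at /sys/kernel/debug, CONFIG_LOCK_EVENT_COUNTS=y and root privileges; "rwsem_sleep_reader" is just one of the event names from lock_events_list.h.]

/*
 * Hypothetical userspace sketch (not part of the patch): read one lock
 * event counter and reset all counters through the debugfs files created
 * by lock_events.c.  Assumes debugfs at /sys/kernel/debug and root.
 */
#include <stdio.h>

#define LOCK_EVENTS_DIR "/sys/kernel/debug/lock_event_counts"

int main(void)
{
        char path[256], buf[64];
        FILE *f;

        /* Each event is a file whose content is the summed per-cpu count. */
        snprintf(path, sizeof(path), "%s/%s", LOCK_EVENTS_DIR, "rwsem_sleep_reader");
        f = fopen(path, "r");
        if (f && fgets(buf, sizeof(buf), f))
                printf("rwsem_sleep_reader = %s", buf);
        if (f)
                fclose(f);

        /* Writing anything to ".reset_counts" zeroes every counter. */
        snprintf(path, sizeof(path), "%s/%s", LOCK_EVENTS_DIR, ".reset_counts");
        f = fopen(path, "w");
        if (f) {
                fputs("1\n", f);
                fclose(f);
        }
        return 0;
}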
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
new file mode 100644
index 000000000000..feb1acc54611
--- /dev/null
+++ b/kernel/locking/lock_events.h
@@ -0,0 +1,59 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef __LOCKING_LOCK_EVENTS_H
17#define __LOCKING_LOCK_EVENTS_H
18
19enum lock_events {
20
21#include "lock_events_list.h"
22
23 lockevent_num, /* Total number of lock event counts */
24 LOCKEVENT_reset_cnts = lockevent_num,
25};
26
27#ifdef CONFIG_LOCK_EVENT_COUNTS
28/*
29 * Per-cpu counters
30 */
31DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
32
33/*
34 * Increment the PV qspinlock statistical counters
35 */
36static inline void __lockevent_inc(enum lock_events event, bool cond)
37{
38 if (cond)
39 __this_cpu_inc(lockevents[event]);
40}
41
42#define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true)
43#define lockevent_cond_inc(ev, c) __lockevent_inc(LOCKEVENT_ ##ev, c)
44
45static inline void __lockevent_add(enum lock_events event, int inc)
46{
47 __this_cpu_add(lockevents[event], inc);
48}
49
50#define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
51
52#else /* CONFIG_LOCK_EVENT_COUNTS */
53
54#define lockevent_inc(ev)
55#define lockevent_add(ev, c)
56#define lockevent_cond_inc(ev, c)
57
58#endif /* CONFIG_LOCK_EVENT_COUNTS */
59#endif /* __LOCKING_LOCK_EVENTS_H */
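
[Editor's note: a minimal standalone sketch of what the lockevent_inc()/lockevent_cond_inc() helpers above count. The per-cpu storage is replaced by a plain array and the demo_* names are invented for the illustration; only the conditional-increment behaviour mirrors the kernel macros.]

#include <stdio.h>

enum demo_events { DEMO_lock_pending, DEMO_lock_slowpath, demo_event_num };

static unsigned long demo_counts[demo_event_num];

static inline void __demo_inc(enum demo_events event, int cond)
{
        if (cond)
                demo_counts[event]++;   /* kernel version: __this_cpu_inc() */
}

#define demo_inc(ev)            __demo_inc(DEMO_ ## ev, 1)
#define demo_cond_inc(ev, c)    __demo_inc(DEMO_ ## ev, (c))

int main(void)
{
        demo_inc(lock_pending);                 /* always counted */
        demo_cond_inc(lock_slowpath, 0);        /* condition false: not counted */
        demo_cond_inc(lock_slowpath, 1);        /* condition true: counted */

        printf("lock_pending=%lu lock_slowpath=%lu\n",
               demo_counts[DEMO_lock_pending], demo_counts[DEMO_lock_slowpath]);
        return 0;                               /* prints: lock_pending=1 lock_slowpath=1 */
}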
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
new file mode 100644
index 000000000000..ad7668cfc9da
--- /dev/null
+++ b/kernel/locking/lock_events_list.h
@@ -0,0 +1,67 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2/*
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License as published by
5 * the Free Software Foundation; either version 2 of the License, or
6 * (at your option) any later version.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * Authors: Waiman Long <longman@redhat.com>
14 */
15
16#ifndef LOCK_EVENT
17#define LOCK_EVENT(name) LOCKEVENT_ ## name,
18#endif
19
20#ifdef CONFIG_QUEUED_SPINLOCKS
21#ifdef CONFIG_PARAVIRT_SPINLOCKS
22/*
23 * Locking events for PV qspinlock.
24 */
25LOCK_EVENT(pv_hash_hops) /* Average # of hops per hashing operation */
26LOCK_EVENT(pv_kick_unlock) /* # of vCPU kicks issued at unlock time */
27LOCK_EVENT(pv_kick_wake) /* # of vCPU kicks for pv_latency_wake */
28LOCK_EVENT(pv_latency_kick) /* Average latency (ns) of vCPU kick */
29LOCK_EVENT(pv_latency_wake) /* Average latency (ns) of kick-to-wakeup */
30LOCK_EVENT(pv_lock_stealing) /* # of lock stealing operations */
31LOCK_EVENT(pv_spurious_wakeup) /* # of spurious wakeups in non-head vCPUs */
32LOCK_EVENT(pv_wait_again) /* # of wait's after queue head vCPU kick */
33LOCK_EVENT(pv_wait_early) /* # of early vCPU wait's */
34LOCK_EVENT(pv_wait_head) /* # of vCPU wait's at the queue head */
35LOCK_EVENT(pv_wait_node) /* # of vCPU wait's at non-head queue node */
36#endif /* CONFIG_PARAVIRT_SPINLOCKS */
37
38/*
39 * Locking events for qspinlock
40 *
41 * Subtracting lock_use_node[234] from lock_slowpath will give you
42 * lock_use_node1.
43 */
44LOCK_EVENT(lock_pending) /* # of locking ops via pending code */
45LOCK_EVENT(lock_slowpath) /* # of locking ops via MCS lock queue */
46LOCK_EVENT(lock_use_node2) /* # of locking ops that use 2nd percpu node */
47LOCK_EVENT(lock_use_node3) /* # of locking ops that use 3rd percpu node */
48LOCK_EVENT(lock_use_node4) /* # of locking ops that use 4th percpu node */
49LOCK_EVENT(lock_no_node) /* # of locking ops w/o using percpu node */
50#endif /* CONFIG_QUEUED_SPINLOCKS */
51
52/*
53 * Locking events for rwsem
54 */
55LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */
56LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */
57LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */
58LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */
59LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */
60LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */
61LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */
62LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */
63LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */
64LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */
65LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
66LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
67LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */
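
[Editor's note: lock_events_list.h is expanded twice with different LOCK_EVENT() definitions — once into enum identifiers (lock_events.h) and once into the debugfs name table (lock_events.c). Below is a self-contained sketch of that x-macro pattern; the event list is inlined as a macro only so the example fits in one file, whereas the kernel re-includes the header.]

#include <stdio.h>

#define DEMO_EVENT_LIST(X)      \
        X(rwsem_rlock)          \
        X(rwsem_wlock)          \
        X(rwsem_wlock_fail)

/* First expansion: enum identifiers, as in lock_events.h. */
#define LOCK_EVENT(name) LOCKEVENT_ ## name,
enum lock_events {
        DEMO_EVENT_LIST(LOCK_EVENT)
        lockevent_num,
};
#undef LOCK_EVENT

/* Second expansion: the debugfs file names, as in lock_events.c. */
#define LOCK_EVENT(name) [LOCKEVENT_ ## name] = #name,
static const char * const lockevent_names[lockevent_num] = {
        DEMO_EVENT_LIST(LOCK_EVENT)
};
#undef LOCK_EVENT

int main(void)
{
        int i;

        for (i = 0; i < lockevent_num; i++)
                printf("%d: %s\n", i, lockevent_names[i]);
        return 0;
}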
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 91c6b89f04df..27b992fe8cec 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -501,11 +501,11 @@ static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
501{ 501{
502 char c = '.'; 502 char c = '.';
503 503
504 if (class->usage_mask & lock_flag(bit + 2)) 504 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
505 c = '+'; 505 c = '+';
506 if (class->usage_mask & lock_flag(bit)) { 506 if (class->usage_mask & lock_flag(bit)) {
507 c = '-'; 507 c = '-';
508 if (class->usage_mask & lock_flag(bit + 2)) 508 if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK))
509 c = '?'; 509 c = '?';
510 } 510 }
511 511
@@ -1666,19 +1666,25 @@ check_redundant(struct lock_list *root, struct lock_class *target,
1666} 1666}
1667 1667
1668#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1668#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1669
1670static inline int usage_accumulate(struct lock_list *entry, void *mask)
1671{
1672 *(unsigned long *)mask |= entry->class->usage_mask;
1673
1674 return 0;
1675}
1676
1669/* 1677/*
1670 * Forwards and backwards subgraph searching, for the purposes of 1678 * Forwards and backwards subgraph searching, for the purposes of
1671 * proving that two subgraphs can be connected by a new dependency 1679 * proving that two subgraphs can be connected by a new dependency
1672 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1680 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1673 */ 1681 */
1674 1682
1675static inline int usage_match(struct lock_list *entry, void *bit) 1683static inline int usage_match(struct lock_list *entry, void *mask)
1676{ 1684{
1677 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); 1685 return entry->class->usage_mask & *(unsigned long *)mask;
1678} 1686}
1679 1687
1680
1681
1682/* 1688/*
1683 * Find a node in the forwards-direction dependency sub-graph starting 1689 * Find a node in the forwards-direction dependency sub-graph starting
1684 * at @root->class that matches @bit. 1690 * at @root->class that matches @bit.
@@ -1690,14 +1696,14 @@ static inline int usage_match(struct lock_list *entry, void *bit)
1690 * Return <0 on error. 1696 * Return <0 on error.
1691 */ 1697 */
1692static int 1698static int
1693find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, 1699find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
1694 struct lock_list **target_entry) 1700 struct lock_list **target_entry)
1695{ 1701{
1696 int result; 1702 int result;
1697 1703
1698 debug_atomic_inc(nr_find_usage_forwards_checks); 1704 debug_atomic_inc(nr_find_usage_forwards_checks);
1699 1705
1700 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1706 result = __bfs_forwards(root, &usage_mask, usage_match, target_entry);
1701 1707
1702 return result; 1708 return result;
1703} 1709}
@@ -1713,14 +1719,14 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1713 * Return <0 on error. 1719 * Return <0 on error.
1714 */ 1720 */
1715static int 1721static int
1716find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, 1722find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
1717 struct lock_list **target_entry) 1723 struct lock_list **target_entry)
1718{ 1724{
1719 int result; 1725 int result;
1720 1726
1721 debug_atomic_inc(nr_find_usage_backwards_checks); 1727 debug_atomic_inc(nr_find_usage_backwards_checks);
1722 1728
1723 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1729 result = __bfs_backwards(root, &usage_mask, usage_match, target_entry);
1724 1730
1725 return result; 1731 return result;
1726} 1732}
@@ -1912,39 +1918,6 @@ print_bad_irq_dependency(struct task_struct *curr,
1912 return 0; 1918 return 0;
1913} 1919}
1914 1920
1915static int
1916check_usage(struct task_struct *curr, struct held_lock *prev,
1917 struct held_lock *next, enum lock_usage_bit bit_backwards,
1918 enum lock_usage_bit bit_forwards, const char *irqclass)
1919{
1920 int ret;
1921 struct lock_list this, that;
1922 struct lock_list *uninitialized_var(target_entry);
1923 struct lock_list *uninitialized_var(target_entry1);
1924
1925 this.parent = NULL;
1926
1927 this.class = hlock_class(prev);
1928 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1929 if (ret < 0)
1930 return print_bfs_bug(ret);
1931 if (ret == 1)
1932 return ret;
1933
1934 that.parent = NULL;
1935 that.class = hlock_class(next);
1936 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1937 if (ret < 0)
1938 return print_bfs_bug(ret);
1939 if (ret == 1)
1940 return ret;
1941
1942 return print_bad_irq_dependency(curr, &this, &that,
1943 target_entry, target_entry1,
1944 prev, next,
1945 bit_backwards, bit_forwards, irqclass);
1946}
1947
1948static const char *state_names[] = { 1921static const char *state_names[] = {
1949#define LOCKDEP_STATE(__STATE) \ 1922#define LOCKDEP_STATE(__STATE) \
1950 __stringify(__STATE), 1923 __stringify(__STATE),
@@ -1961,9 +1934,19 @@ static const char *state_rnames[] = {
1961 1934
1962static inline const char *state_name(enum lock_usage_bit bit) 1935static inline const char *state_name(enum lock_usage_bit bit)
1963{ 1936{
1964 return (bit & LOCK_USAGE_READ_MASK) ? state_rnames[bit >> 2] : state_names[bit >> 2]; 1937 if (bit & LOCK_USAGE_READ_MASK)
1938 return state_rnames[bit >> LOCK_USAGE_DIR_MASK];
1939 else
1940 return state_names[bit >> LOCK_USAGE_DIR_MASK];
1965} 1941}
1966 1942
1943/*
1944 * The bit number is encoded like:
1945 *
1946 * bit0: 0 exclusive, 1 read lock
1947 * bit1: 0 used in irq, 1 irq enabled
1948 * bit2-n: state
1949 */
1967static int exclusive_bit(int new_bit) 1950static int exclusive_bit(int new_bit)
1968{ 1951{
1969 int state = new_bit & LOCK_USAGE_STATE_MASK; 1952 int state = new_bit & LOCK_USAGE_STATE_MASK;
@@ -1975,45 +1958,160 @@ static int exclusive_bit(int new_bit)
1975 return state | (dir ^ LOCK_USAGE_DIR_MASK); 1958 return state | (dir ^ LOCK_USAGE_DIR_MASK);
1976} 1959}
1977 1960
1961/*
1962 * Observe that when given a bitmask where each bitnr is encoded as above, a
1963 * right shift of the mask transforms the individual bitnrs as -1 and
1964 * conversely, a left shift transforms into +1 for the individual bitnrs.
1965 *
1966 * So for all bits whose number have LOCK_ENABLED_* set (bitnr1 == 1), we can
1967 * create the mask with those bit numbers using LOCK_USED_IN_* (bitnr1 == 0)
1968 * instead by subtracting the bit number by 2, or shifting the mask right by 2.
1969 *
1970 * Similarly, bitnr1 == 0 becomes bitnr1 == 1 by adding 2, or shifting left 2.
1971 *
1972 * So split the mask (note that LOCKF_ENABLED_IRQ_ALL|LOCKF_USED_IN_IRQ_ALL is
1973 * all bits set) and recompose with bitnr1 flipped.
1974 */
1975static unsigned long invert_dir_mask(unsigned long mask)
1976{
1977 unsigned long excl = 0;
1978
1979 /* Invert dir */
1980 excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
1981 excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
1982
1983 return excl;
1984}
1985
1986/*
1987 * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
1988 * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
1989 * And then mask out all bitnr0.
1990 */
1991static unsigned long exclusive_mask(unsigned long mask)
1992{
1993 unsigned long excl = invert_dir_mask(mask);
1994
1995 /* Strip read */
1996 excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
1997 excl &= ~LOCKF_IRQ_READ;
1998
1999 return excl;
2000}
2001
2002/*
2003 * Retrieve the _possible_ original mask to which @mask is
2004 * exclusive. Ie: this is the opposite of exclusive_mask().
2005 * Note that 2 possible original bits can match an exclusive
2006 * bit: one has LOCK_USAGE_READ_MASK set, the other has it
2007 * cleared. So both are returned for each exclusive bit.
2008 */
2009static unsigned long original_mask(unsigned long mask)
2010{
2011 unsigned long excl = invert_dir_mask(mask);
2012
2013 /* Include read in existing usages */
2014 excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
2015
2016 return excl;
2017}
2018
2019/*
2020 * Find the first pair of bit match between an original
2021 * usage mask and an exclusive usage mask.
2022 */
2023static int find_exclusive_match(unsigned long mask,
2024 unsigned long excl_mask,
2025 enum lock_usage_bit *bitp,
2026 enum lock_usage_bit *excl_bitp)
2027{
2028 int bit, excl;
2029
2030 for_each_set_bit(bit, &mask, LOCK_USED) {
2031 excl = exclusive_bit(bit);
2032 if (excl_mask & lock_flag(excl)) {
2033 *bitp = bit;
2034 *excl_bitp = excl;
2035 return 0;
2036 }
2037 }
2038 return -1;
2039}
2040
2041/*
2042 * Prove that the new dependency does not connect a hardirq-safe(-read)
2043 * lock with a hardirq-unsafe lock - to achieve this we search
2044 * the backwards-subgraph starting at <prev>, and the
2045 * forwards-subgraph starting at <next>:
2046 */
1978static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, 2047static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
1979 struct held_lock *next, enum lock_usage_bit bit) 2048 struct held_lock *next)
1980{ 2049{
2050 unsigned long usage_mask = 0, forward_mask, backward_mask;
2051 enum lock_usage_bit forward_bit = 0, backward_bit = 0;
2052 struct lock_list *uninitialized_var(target_entry1);
2053 struct lock_list *uninitialized_var(target_entry);
2054 struct lock_list this, that;
2055 int ret;
2056
1981 /* 2057 /*
1982 * Prove that the new dependency does not connect a hardirq-safe 2058 * Step 1: gather all hard/soft IRQs usages backward in an
1983 * lock with a hardirq-unsafe lock - to achieve this we search 2059 * accumulated usage mask.
1984 * the backwards-subgraph starting at <prev>, and the
1985 * forwards-subgraph starting at <next>:
1986 */ 2060 */
1987 if (!check_usage(curr, prev, next, bit, 2061 this.parent = NULL;
1988 exclusive_bit(bit), state_name(bit))) 2062 this.class = hlock_class(prev);
1989 return 0; 2063
2064 ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
2065 if (ret < 0)
2066 return print_bfs_bug(ret);
1990 2067
1991 bit++; /* _READ */ 2068 usage_mask &= LOCKF_USED_IN_IRQ_ALL;
2069 if (!usage_mask)
2070 return 1;
1992 2071
1993 /* 2072 /*
1994 * Prove that the new dependency does not connect a hardirq-safe-read 2073 * Step 2: find exclusive uses forward that match the previous
1995 * lock with a hardirq-unsafe lock - to achieve this we search 2074 * backward accumulated mask.
1996 * the backwards-subgraph starting at <prev>, and the
1997 * forwards-subgraph starting at <next>:
1998 */ 2075 */
1999 if (!check_usage(curr, prev, next, bit, 2076 forward_mask = exclusive_mask(usage_mask);
2000 exclusive_bit(bit), state_name(bit)))
2001 return 0;
2002 2077
2003 return 1; 2078 that.parent = NULL;
2004} 2079 that.class = hlock_class(next);
2005 2080
2006static int 2081 ret = find_usage_forwards(&that, forward_mask, &target_entry1);
2007check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2082 if (ret < 0)
2008 struct held_lock *next) 2083 return print_bfs_bug(ret);
2009{ 2084 if (ret == 1)
2010#define LOCKDEP_STATE(__STATE) \ 2085 return ret;
2011 if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
2012 return 0;
2013#include "lockdep_states.h"
2014#undef LOCKDEP_STATE
2015 2086
2016 return 1; 2087 /*
2088 * Step 3: we found a bad match! Now retrieve a lock from the backward
2089 * list whose usage mask matches the exclusive usage mask from the
2090 * lock found on the forward list.
2091 */
2092 backward_mask = original_mask(target_entry1->class->usage_mask);
2093
2094 ret = find_usage_backwards(&this, backward_mask, &target_entry);
2095 if (ret < 0)
2096 return print_bfs_bug(ret);
2097 if (DEBUG_LOCKS_WARN_ON(ret == 1))
2098 return 1;
2099
2100 /*
2101 * Step 4: narrow down to a pair of incompatible usage bits
2102 * and report it.
2103 */
2104 ret = find_exclusive_match(target_entry->class->usage_mask,
2105 target_entry1->class->usage_mask,
2106 &backward_bit, &forward_bit);
2107 if (DEBUG_LOCKS_WARN_ON(ret == -1))
2108 return 1;
2109
2110 return print_bad_irq_dependency(curr, &this, &that,
2111 target_entry, target_entry1,
2112 prev, next,
2113 backward_bit, forward_bit,
2114 state_name(backward_bit));
2017} 2115}
2018 2116
2019static void inc_chains(void) 2117static void inc_chains(void)
@@ -2030,9 +2128,8 @@ static void inc_chains(void)
2030 2128
2031#else 2129#else
2032 2130
2033static inline int 2131static inline int check_irq_usage(struct task_struct *curr,
2034check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, 2132 struct held_lock *prev, struct held_lock *next)
2035 struct held_lock *next)
2036{ 2133{
2037 return 1; 2134 return 1;
2038} 2135}
@@ -2211,7 +2308,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
2211 else if (unlikely(ret < 0)) 2308 else if (unlikely(ret < 0))
2212 return print_bfs_bug(ret); 2309 return print_bfs_bug(ret);
2213 2310
2214 if (!check_prev_add_irq(curr, prev, next)) 2311 if (!check_irq_usage(curr, prev, next))
2215 return 0; 2312 return 0;
2216 2313
2217 /* 2314 /*
@@ -2773,6 +2870,12 @@ static void check_chain_key(struct task_struct *curr)
2773#endif 2870#endif
2774} 2871}
2775 2872
2873static int mark_lock(struct task_struct *curr, struct held_lock *this,
2874 enum lock_usage_bit new_bit);
2875
2876#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2877
2878
2776static void 2879static void
2777print_usage_bug_scenario(struct held_lock *lock) 2880print_usage_bug_scenario(struct held_lock *lock)
2778{ 2881{
@@ -2842,10 +2945,6 @@ valid_state(struct task_struct *curr, struct held_lock *this,
2842 return 1; 2945 return 1;
2843} 2946}
2844 2947
2845static int mark_lock(struct task_struct *curr, struct held_lock *this,
2846 enum lock_usage_bit new_bit);
2847
2848#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
2849 2948
2850/* 2949/*
2851 * print irq inversion bug: 2950 * print irq inversion bug:
@@ -2925,7 +3024,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
2925 3024
2926 root.parent = NULL; 3025 root.parent = NULL;
2927 root.class = hlock_class(this); 3026 root.class = hlock_class(this);
2928 ret = find_usage_forwards(&root, bit, &target_entry); 3027 ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
2929 if (ret < 0) 3028 if (ret < 0)
2930 return print_bfs_bug(ret); 3029 return print_bfs_bug(ret);
2931 if (ret == 1) 3030 if (ret == 1)
@@ -2949,7 +3048,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2949 3048
2950 root.parent = NULL; 3049 root.parent = NULL;
2951 root.class = hlock_class(this); 3050 root.class = hlock_class(this);
2952 ret = find_usage_backwards(&root, bit, &target_entry); 3051 ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
2953 if (ret < 0) 3052 if (ret < 0)
2954 return print_bfs_bug(ret); 3053 return print_bfs_bug(ret);
2955 if (ret == 1) 3054 if (ret == 1)
@@ -3004,7 +3103,7 @@ static int (*state_verbose_f[])(struct lock_class *class) = {
3004static inline int state_verbose(enum lock_usage_bit bit, 3103static inline int state_verbose(enum lock_usage_bit bit,
3005 struct lock_class *class) 3104 struct lock_class *class)
3006{ 3105{
3007 return state_verbose_f[bit >> 2](class); 3106 return state_verbose_f[bit >> LOCK_USAGE_DIR_MASK](class);
3008} 3107}
3009 3108
3010typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, 3109typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
@@ -3146,7 +3245,7 @@ void lockdep_hardirqs_on(unsigned long ip)
3146 /* 3245 /*
3147 * See the fine text that goes along with this variable definition. 3246 * See the fine text that goes along with this variable definition.
3148 */ 3247 */
3149 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 3248 if (DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled))
3150 return; 3249 return;
3151 3250
3152 /* 3251 /*
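
[Editor's note: the new comments in the check_irq_usage() rework above describe the usage-bit encoding (bit0 = read, bit1 = direction, higher bits = irq state) and the mask transforms built on it. The standalone sketch below works one example through exclusive_bit() and exclusive_mask(); the constants are reproduced here only for illustration, assuming the two irq states (hardirq, softirq) of the kernel's lockdep_states.h.]

#include <assert.h>
#include <stdio.h>

#define LOCK_USAGE_READ_MASK    1       /* bit0: 0 exclusive, 1 read lock      */
#define LOCK_USAGE_DIR_MASK     2       /* bit1: 0 used-in-irq, 1 irq-enabled  */
#define LOCK_USAGE_STATE_MASK   (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))

enum {                                  /* bit numbers: hardirq, then softirq  */
        LOCK_USED_IN_HARDIRQ,           /* 0 */
        LOCK_USED_IN_HARDIRQ_READ,      /* 1 */
        LOCK_ENABLED_HARDIRQ,           /* 2 */
        LOCK_ENABLED_HARDIRQ_READ,      /* 3 */
        LOCK_USED_IN_SOFTIRQ,           /* 4 */
        LOCK_USED_IN_SOFTIRQ_READ,      /* 5 */
        LOCK_ENABLED_SOFTIRQ,           /* 6 */
        LOCK_ENABLED_SOFTIRQ_READ,      /* 7 */
};

#define LOCKF(bit)              (1UL << (bit))
#define LOCKF_USED_IN_IRQ_ALL   (LOCKF(0) | LOCKF(1) | LOCKF(4) | LOCKF(5))
#define LOCKF_ENABLED_IRQ_ALL   (LOCKF(2) | LOCKF(3) | LOCKF(6) | LOCKF(7))
#define LOCKF_IRQ_READ          (LOCKF(1) | LOCKF(3) | LOCKF(5) | LOCKF(7))

/* Same logic as lockdep's exclusive_bit(): flip the direction, drop read. */
static int exclusive_bit(int new_bit)
{
        int state = new_bit & LOCK_USAGE_STATE_MASK;
        int dir = new_bit & LOCK_USAGE_DIR_MASK;

        return state | (dir ^ LOCK_USAGE_DIR_MASK);
}

/* Shifting by 2 moves every bit number between USED_IN and ENABLED. */
static unsigned long invert_dir_mask(unsigned long mask)
{
        unsigned long excl = 0;

        excl |= (mask & LOCKF_ENABLED_IRQ_ALL) >> LOCK_USAGE_DIR_MASK;
        excl |= (mask & LOCKF_USED_IN_IRQ_ALL) << LOCK_USAGE_DIR_MASK;
        return excl;
}

/* Additionally fold the read bits onto their non-read counterparts. */
static unsigned long exclusive_mask(unsigned long mask)
{
        unsigned long excl = invert_dir_mask(mask);

        excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
        excl &= ~LOCKF_IRQ_READ;
        return excl;
}

int main(void)
{
        /* A lock taken for reading in hardirq context ... */
        unsigned long usage = LOCKF(LOCK_USED_IN_HARDIRQ_READ);

        /* ... is incompatible with a lock held with hardirqs enabled. */
        assert(exclusive_bit(LOCK_USED_IN_HARDIRQ_READ) == LOCK_ENABLED_HARDIRQ);
        assert(exclusive_mask(usage) == LOCKF(LOCK_ENABLED_HARDIRQ));

        printf("exclusive_mask(0x%lx) = 0x%lx\n", usage, exclusive_mask(usage));
        return 0;
}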
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index d4c197425f68..150ec3f0c5b5 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -42,13 +42,35 @@ enum {
42 __LOCKF(USED) 42 __LOCKF(USED)
43}; 43};
44 44
45#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) 45#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
46#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) 46static const unsigned long LOCKF_ENABLED_IRQ =
47#include "lockdep_states.h"
48 0;
49#undef LOCKDEP_STATE
50
51#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE |
52static const unsigned long LOCKF_USED_IN_IRQ =
53#include "lockdep_states.h"
54 0;
55#undef LOCKDEP_STATE
56
57#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE##_READ |
58static const unsigned long LOCKF_ENABLED_IRQ_READ =
59#include "lockdep_states.h"
60 0;
61#undef LOCKDEP_STATE
62
63#define LOCKDEP_STATE(__STATE) LOCKF_USED_IN_##__STATE##_READ |
64static const unsigned long LOCKF_USED_IN_IRQ_READ =
65#include "lockdep_states.h"
66 0;
67#undef LOCKDEP_STATE
68
69#define LOCKF_ENABLED_IRQ_ALL (LOCKF_ENABLED_IRQ | LOCKF_ENABLED_IRQ_READ)
70#define LOCKF_USED_IN_IRQ_ALL (LOCKF_USED_IN_IRQ | LOCKF_USED_IN_IRQ_READ)
47 71
48#define LOCKF_ENABLED_IRQ_READ \ 72#define LOCKF_IRQ (LOCKF_ENABLED_IRQ | LOCKF_USED_IN_IRQ)
49 (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) 73#define LOCKF_IRQ_READ (LOCKF_ENABLED_IRQ_READ | LOCKF_USED_IN_IRQ_READ)
50#define LOCKF_USED_IN_IRQ_READ \
51 (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
52 74
53/* 75/*
54 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, 76 * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text,
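
[Editor's note: the LOCKF_* composites above are generated by expanding LOCKDEP_STATE() into a chain of "mask |" terms terminated by 0. A compact standalone sketch of the same trick follows; the state list is inlined as a macro and the two mask values are illustrative stand-ins for the real enum bit positions.]

#include <stdio.h>

#define LOCKF_ENABLED_HARDIRQ   0x04UL
#define LOCKF_ENABLED_SOFTIRQ   0x40UL

#define DEMO_STATES(X)  X(HARDIRQ) X(SOFTIRQ)

#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
static const unsigned long LOCKF_ENABLED_IRQ =
        DEMO_STATES(LOCKDEP_STATE)
        0;
#undef LOCKDEP_STATE

int main(void)
{
        printf("LOCKF_ENABLED_IRQ = 0x%lx\n", LOCKF_ENABLED_IRQ);       /* 0x44 */
        return 0;
}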
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 883cf1b92d90..f17dad99eec8 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,8 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9 9
10#include "rwsem.h"
11
10int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, 12int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
11 const char *name, struct lock_class_key *rwsem_key) 13 const char *name, struct lock_class_key *rwsem_key)
12{ 14{
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 5e9247dc2515..e14b32c69639 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -395,7 +395,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
395 * 0,1,0 -> 0,0,1 395 * 0,1,0 -> 0,0,1
396 */ 396 */
397 clear_pending_set_locked(lock); 397 clear_pending_set_locked(lock);
398 qstat_inc(qstat_lock_pending, true); 398 lockevent_inc(lock_pending);
399 return; 399 return;
400 400
401 /* 401 /*
@@ -403,7 +403,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
403 * queuing. 403 * queuing.
404 */ 404 */
405queue: 405queue:
406 qstat_inc(qstat_lock_slowpath, true); 406 lockevent_inc(lock_slowpath);
407pv_queue: 407pv_queue:
408 node = this_cpu_ptr(&qnodes[0].mcs); 408 node = this_cpu_ptr(&qnodes[0].mcs);
409 idx = node->count++; 409 idx = node->count++;
@@ -419,7 +419,7 @@ pv_queue:
419 * simple enough. 419 * simple enough.
420 */ 420 */
421 if (unlikely(idx >= MAX_NODES)) { 421 if (unlikely(idx >= MAX_NODES)) {
422 qstat_inc(qstat_lock_no_node, true); 422 lockevent_inc(lock_no_node);
423 while (!queued_spin_trylock(lock)) 423 while (!queued_spin_trylock(lock))
424 cpu_relax(); 424 cpu_relax();
425 goto release; 425 goto release;
@@ -430,7 +430,7 @@ pv_queue:
430 /* 430 /*
431 * Keep counts of non-zero index values: 431 * Keep counts of non-zero index values:
432 */ 432 */
433 qstat_inc(qstat_lock_use_node2 + idx - 1, idx); 433 lockevent_cond_inc(lock_use_node2 + idx - 1, idx);
434 434
435 /* 435 /*
436 * Ensure that we increment the head node->count before initialising 436 * Ensure that we increment the head node->count before initialising
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 8f36c27c1794..89bab079e7a4 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -89,7 +89,7 @@ static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
89 89
90 if (!(val & _Q_LOCKED_PENDING_MASK) && 90 if (!(val & _Q_LOCKED_PENDING_MASK) &&
91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) { 91 (cmpxchg_acquire(&lock->locked, 0, _Q_LOCKED_VAL) == 0)) {
92 qstat_inc(qstat_pv_lock_stealing, true); 92 lockevent_inc(pv_lock_stealing);
93 return true; 93 return true;
94 } 94 }
95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK)) 95 if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
@@ -219,7 +219,7 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
219 hopcnt++; 219 hopcnt++;
220 if (!cmpxchg(&he->lock, NULL, lock)) { 220 if (!cmpxchg(&he->lock, NULL, lock)) {
221 WRITE_ONCE(he->node, node); 221 WRITE_ONCE(he->node, node);
222 qstat_hop(hopcnt); 222 lockevent_pv_hop(hopcnt);
223 return &he->lock; 223 return &he->lock;
224 } 224 }
225 } 225 }
@@ -320,8 +320,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
320 smp_store_mb(pn->state, vcpu_halted); 320 smp_store_mb(pn->state, vcpu_halted);
321 321
322 if (!READ_ONCE(node->locked)) { 322 if (!READ_ONCE(node->locked)) {
323 qstat_inc(qstat_pv_wait_node, true); 323 lockevent_inc(pv_wait_node);
324 qstat_inc(qstat_pv_wait_early, wait_early); 324 lockevent_cond_inc(pv_wait_early, wait_early);
325 pv_wait(&pn->state, vcpu_halted); 325 pv_wait(&pn->state, vcpu_halted);
326 } 326 }
327 327
@@ -339,7 +339,8 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
339 * So it is better to spin for a while in the hope that the 339 * So it is better to spin for a while in the hope that the
340 * MCS lock will be released soon. 340 * MCS lock will be released soon.
341 */ 341 */
342 qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); 342 lockevent_cond_inc(pv_spurious_wakeup,
343 !READ_ONCE(node->locked));
343 } 344 }
344 345
345 /* 346 /*
@@ -416,7 +417,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
416 /* 417 /*
417 * Tracking # of slowpath locking operations 418 * Tracking # of slowpath locking operations
418 */ 419 */
419 qstat_inc(qstat_lock_slowpath, true); 420 lockevent_inc(lock_slowpath);
420 421
421 for (;; waitcnt++) { 422 for (;; waitcnt++) {
422 /* 423 /*
@@ -464,8 +465,8 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
464 } 465 }
465 } 466 }
466 WRITE_ONCE(pn->state, vcpu_hashed); 467 WRITE_ONCE(pn->state, vcpu_hashed);
467 qstat_inc(qstat_pv_wait_head, true); 468 lockevent_inc(pv_wait_head);
468 qstat_inc(qstat_pv_wait_again, waitcnt); 469 lockevent_cond_inc(pv_wait_again, waitcnt);
469 pv_wait(&lock->locked, _Q_SLOW_VAL); 470 pv_wait(&lock->locked, _Q_SLOW_VAL);
470 471
471 /* 472 /*
@@ -528,7 +529,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
528 * vCPU is harmless other than the additional latency in completing 529 * vCPU is harmless other than the additional latency in completing
529 * the unlock. 530 * the unlock.
530 */ 531 */
531 qstat_inc(qstat_pv_kick_unlock, true); 532 lockevent_inc(pv_kick_unlock);
532 pv_kick(node->cpu); 533 pv_kick(node->cpu);
533} 534}
534 535
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index d73f85388d5c..54152670ff24 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -9,262 +9,105 @@
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details. 10 * GNU General Public License for more details.
11 * 11 *
12 * Authors: Waiman Long <waiman.long@hpe.com> 12 * Authors: Waiman Long <longman@redhat.com>
13 */ 13 */
14 14
15/* 15#include "lock_events.h"
16 * When queued spinlock statistical counters are enabled, the following
17 * debugfs files will be created for reporting the counter values:
18 *
19 * <debugfs>/qlockstat/
20 * pv_hash_hops - average # of hops per hashing operation
21 * pv_kick_unlock - # of vCPU kicks issued at unlock time
22 * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
23 * pv_latency_kick - average latency (ns) of vCPU kick operation
24 * pv_latency_wake - average latency (ns) from vCPU kick to wakeup
25 * pv_lock_stealing - # of lock stealing operations
26 * pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
27 * pv_wait_again - # of wait's after a queue head vCPU kick
28 * pv_wait_early - # of early vCPU wait's
29 * pv_wait_head - # of vCPU wait's at the queue head
30 * pv_wait_node - # of vCPU wait's at a non-head queue node
31 * lock_pending - # of locking operations via pending code
32 * lock_slowpath - # of locking operations via MCS lock queue
33 * lock_use_node2 - # of locking operations that use 2nd per-CPU node
34 * lock_use_node3 - # of locking operations that use 3rd per-CPU node
35 * lock_use_node4 - # of locking operations that use 4th per-CPU node
36 * lock_no_node - # of locking operations without using per-CPU node
37 *
38 * Subtracting lock_use_node[234] from lock_slowpath will give you
39 * lock_use_node1.
40 *
41 * Writing to the "reset_counters" file will reset all the above counter
42 * values.
43 *
44 * These statistical counters are implemented as per-cpu variables which are
45 * summed and computed whenever the corresponding debugfs files are read. This
46 * minimizes added overhead making the counters usable even in a production
47 * environment.
48 *
49 * There may be slight difference between pv_kick_wake and pv_kick_unlock.
50 */
51enum qlock_stats {
52 qstat_pv_hash_hops,
53 qstat_pv_kick_unlock,
54 qstat_pv_kick_wake,
55 qstat_pv_latency_kick,
56 qstat_pv_latency_wake,
57 qstat_pv_lock_stealing,
58 qstat_pv_spurious_wakeup,
59 qstat_pv_wait_again,
60 qstat_pv_wait_early,
61 qstat_pv_wait_head,
62 qstat_pv_wait_node,
63 qstat_lock_pending,
64 qstat_lock_slowpath,
65 qstat_lock_use_node2,
66 qstat_lock_use_node3,
67 qstat_lock_use_node4,
68 qstat_lock_no_node,
69 qstat_num, /* Total number of statistical counters */
70 qstat_reset_cnts = qstat_num,
71};
72 16
73#ifdef CONFIG_QUEUED_LOCK_STAT 17#ifdef CONFIG_LOCK_EVENT_COUNTS
18#ifdef CONFIG_PARAVIRT_SPINLOCKS
74/* 19/*
75 * Collect pvqspinlock statistics 20 * Collect pvqspinlock locking event counts
76 */ 21 */
77#include <linux/debugfs.h>
78#include <linux/sched.h> 22#include <linux/sched.h>
79#include <linux/sched/clock.h> 23#include <linux/sched/clock.h>
80#include <linux/fs.h> 24#include <linux/fs.h>
81 25
82static const char * const qstat_names[qstat_num + 1] = { 26#define EVENT_COUNT(ev) lockevents[LOCKEVENT_ ## ev]
83 [qstat_pv_hash_hops] = "pv_hash_hops",
84 [qstat_pv_kick_unlock] = "pv_kick_unlock",
85 [qstat_pv_kick_wake] = "pv_kick_wake",
86 [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
87 [qstat_pv_latency_kick] = "pv_latency_kick",
88 [qstat_pv_latency_wake] = "pv_latency_wake",
89 [qstat_pv_lock_stealing] = "pv_lock_stealing",
90 [qstat_pv_wait_again] = "pv_wait_again",
91 [qstat_pv_wait_early] = "pv_wait_early",
92 [qstat_pv_wait_head] = "pv_wait_head",
93 [qstat_pv_wait_node] = "pv_wait_node",
94 [qstat_lock_pending] = "lock_pending",
95 [qstat_lock_slowpath] = "lock_slowpath",
96 [qstat_lock_use_node2] = "lock_use_node2",
97 [qstat_lock_use_node3] = "lock_use_node3",
98 [qstat_lock_use_node4] = "lock_use_node4",
99 [qstat_lock_no_node] = "lock_no_node",
100 [qstat_reset_cnts] = "reset_counters",
101};
102 27
103/* 28/*
104 * Per-cpu counters 29 * PV specific per-cpu counter
105 */ 30 */
106static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
107static DEFINE_PER_CPU(u64, pv_kick_time); 31static DEFINE_PER_CPU(u64, pv_kick_time);
108 32
109/* 33/*
110 * Function to read and return the qlock statistical counter values 34 * Function to read and return the PV qspinlock counts.
111 * 35 *
112 * The following counters are handled specially: 36 * The following counters are handled specially:
113 * 1. qstat_pv_latency_kick 37 * 1. pv_latency_kick
114 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock 38 * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
115 * 2. qstat_pv_latency_wake 39 * 2. pv_latency_wake
116 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake 40 * Average wake latency (ns) = pv_latency_wake/pv_kick_wake
117 * 3. qstat_pv_hash_hops 41 * 3. pv_hash_hops
118 * Average hops/hash = pv_hash_hops/pv_kick_unlock 42 * Average hops/hash = pv_hash_hops/pv_kick_unlock
119 */ 43 */
120static ssize_t qstat_read(struct file *file, char __user *user_buf, 44ssize_t lockevent_read(struct file *file, char __user *user_buf,
121 size_t count, loff_t *ppos) 45 size_t count, loff_t *ppos)
122{ 46{
123 char buf[64]; 47 char buf[64];
124 int cpu, counter, len; 48 int cpu, id, len;
125 u64 stat = 0, kicks = 0; 49 u64 sum = 0, kicks = 0;
126 50
127 /* 51 /*
128 * Get the counter ID stored in file->f_inode->i_private 52 * Get the counter ID stored in file->f_inode->i_private
129 */ 53 */
130 counter = (long)file_inode(file)->i_private; 54 id = (long)file_inode(file)->i_private;
131 55
132 if (counter >= qstat_num) 56 if (id >= lockevent_num)
133 return -EBADF; 57 return -EBADF;
134 58
135 for_each_possible_cpu(cpu) { 59 for_each_possible_cpu(cpu) {
136 stat += per_cpu(qstats[counter], cpu); 60 sum += per_cpu(lockevents[id], cpu);
137 /* 61 /*
138 * Need to sum additional counter for some of them 62 * Need to sum additional counters for some of them
139 */ 63 */
140 switch (counter) { 64 switch (id) {
141 65
142 case qstat_pv_latency_kick: 66 case LOCKEVENT_pv_latency_kick:
143 case qstat_pv_hash_hops: 67 case LOCKEVENT_pv_hash_hops:
144 kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); 68 kicks += per_cpu(EVENT_COUNT(pv_kick_unlock), cpu);
145 break; 69 break;
146 70
147 case qstat_pv_latency_wake: 71 case LOCKEVENT_pv_latency_wake:
148 kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); 72 kicks += per_cpu(EVENT_COUNT(pv_kick_wake), cpu);
149 break; 73 break;
150 } 74 }
151 } 75 }
152 76
153 if (counter == qstat_pv_hash_hops) { 77 if (id == LOCKEVENT_pv_hash_hops) {
154 u64 frac = 0; 78 u64 frac = 0;
155 79
156 if (kicks) { 80 if (kicks) {
157 frac = 100ULL * do_div(stat, kicks); 81 frac = 100ULL * do_div(sum, kicks);
158 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); 82 frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
159 } 83 }
160 84
161 /* 85 /*
162 * Return a X.XX decimal number 86 * Return a X.XX decimal number
163 */ 87 */
164 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); 88 len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n",
89 sum, frac);
165 } else { 90 } else {
166 /* 91 /*
167 * Round to the nearest ns 92 * Round to the nearest ns
168 */ 93 */
169 if ((counter == qstat_pv_latency_kick) || 94 if ((id == LOCKEVENT_pv_latency_kick) ||
170 (counter == qstat_pv_latency_wake)) { 95 (id == LOCKEVENT_pv_latency_wake)) {
171 if (kicks) 96 if (kicks)
172 stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); 97 sum = DIV_ROUND_CLOSEST_ULL(sum, kicks);
173 } 98 }
174 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); 99 len = snprintf(buf, sizeof(buf) - 1, "%llu\n", sum);
175 } 100 }
176 101
177 return simple_read_from_buffer(user_buf, count, ppos, buf, len); 102 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
178} 103}
179 104
180/* 105/*
181 * Function to handle write request
182 *
183 * When counter = reset_cnts, reset all the counter values.
184 * Since the counter updates aren't atomic, the resetting is done twice
185 * to make sure that the counters are very likely to be all cleared.
186 */
187static ssize_t qstat_write(struct file *file, const char __user *user_buf,
188 size_t count, loff_t *ppos)
189{
190 int cpu;
191
192 /*
193 * Get the counter ID stored in file->f_inode->i_private
194 */
195 if ((long)file_inode(file)->i_private != qstat_reset_cnts)
196 return count;
197
198 for_each_possible_cpu(cpu) {
199 int i;
200 unsigned long *ptr = per_cpu_ptr(qstats, cpu);
201
202 for (i = 0 ; i < qstat_num; i++)
203 WRITE_ONCE(ptr[i], 0);
204 }
205 return count;
206}
207
208/*
209 * Debugfs data structures
210 */
211static const struct file_operations fops_qstat = {
212 .read = qstat_read,
213 .write = qstat_write,
214 .llseek = default_llseek,
215};
216
217/*
218 * Initialize debugfs for the qspinlock statistical counters
219 */
220static int __init init_qspinlock_stat(void)
221{
222 struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
223 int i;
224
225 if (!d_qstat)
226 goto out;
227
228 /*
229 * Create the debugfs files
230 *
231 * As reading from and writing to the stat files can be slow, only
232 * root is allowed to do the read/write to limit impact to system
233 * performance.
234 */
235 for (i = 0; i < qstat_num; i++)
236 if (!debugfs_create_file(qstat_names[i], 0400, d_qstat,
237 (void *)(long)i, &fops_qstat))
238 goto fail_undo;
239
240 if (!debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
241 (void *)(long)qstat_reset_cnts, &fops_qstat))
242 goto fail_undo;
243
244 return 0;
245fail_undo:
246 debugfs_remove_recursive(d_qstat);
247out:
248 pr_warn("Could not create 'qlockstat' debugfs entries\n");
249 return -ENOMEM;
250}
251fs_initcall(init_qspinlock_stat);
252
253/*
254 * Increment the PV qspinlock statistical counters
255 */
256static inline void qstat_inc(enum qlock_stats stat, bool cond)
257{
258 if (cond)
259 this_cpu_inc(qstats[stat]);
260}
261
262/*
263 * PV hash hop count 106 * PV hash hop count
264 */ 107 */
265static inline void qstat_hop(int hopcnt) 108static inline void lockevent_pv_hop(int hopcnt)
266{ 109{
267 this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); 110 this_cpu_add(EVENT_COUNT(pv_hash_hops), hopcnt);
268} 111}
269 112
270/* 113/*
@@ -276,7 +119,7 @@ static inline void __pv_kick(int cpu)
276 119
277 per_cpu(pv_kick_time, cpu) = start; 120 per_cpu(pv_kick_time, cpu) = start;
278 pv_kick(cpu); 121 pv_kick(cpu);
279 this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); 122 this_cpu_add(EVENT_COUNT(pv_latency_kick), sched_clock() - start);
280} 123}
281 124
282/* 125/*
@@ -289,18 +132,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
289 *pkick_time = 0; 132 *pkick_time = 0;
290 pv_wait(ptr, val); 133 pv_wait(ptr, val);
291 if (*pkick_time) { 134 if (*pkick_time) {
292 this_cpu_add(qstats[qstat_pv_latency_wake], 135 this_cpu_add(EVENT_COUNT(pv_latency_wake),
293 sched_clock() - *pkick_time); 136 sched_clock() - *pkick_time);
294 qstat_inc(qstat_pv_kick_wake, true); 137 lockevent_inc(pv_kick_wake);
295 } 138 }
296} 139}
297 140
298#define pv_kick(c) __pv_kick(c) 141#define pv_kick(c) __pv_kick(c)
299#define pv_wait(p, v) __pv_wait(p, v) 142#define pv_wait(p, v) __pv_wait(p, v)
300 143
301#else /* CONFIG_QUEUED_LOCK_STAT */ 144#endif /* CONFIG_PARAVIRT_SPINLOCKS */
145
146#else /* CONFIG_LOCK_EVENT_COUNTS */
302 147
303static inline void qstat_inc(enum qlock_stats stat, bool cond) { } 148static inline void lockevent_pv_hop(int hopcnt) { }
304static inline void qstat_hop(int hopcnt) { }
305 149
306#endif /* CONFIG_QUEUED_LOCK_STAT */ 150#endif /* CONFIG_LOCK_EVENT_COUNTS */
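
[Editor's note: the PV override of lockevent_read() above reports pv_hash_hops as an "X.XX" average — total hops divided by pv_kick_unlock — without using floating point. A standalone sketch of that fixed-point computation, with made-up sample numbers:]

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
        uint64_t sum = 523;     /* e.g. accumulated pv_hash_hops */
        uint64_t kicks = 250;   /* e.g. pv_kick_unlock count     */
        uint64_t whole, frac;

        whole = sum / kicks;                    /* integer part                  */
        frac  = 100 * (sum % kicks);            /* kernel: do_div() remainder    */
        frac  = (frac + kicks / 2) / kicks;     /* DIV_ROUND_CLOSEST_ULL()       */

        printf("pv_hash_hops = %" PRIu64 ".%02" PRIu64 "\n", whole, frac);
        /* prints: pv_hash_hops = 2.09 */
        return 0;
}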
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
deleted file mode 100644
index a7ffb2a96ede..000000000000
--- a/kernel/locking/rwsem-spinlock.c
+++ /dev/null
@@ -1,339 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/* rwsem-spinlock.c: R/W semaphores: contention handling functions for
3 * generic spinlock implementation
4 *
5 * Copyright (c) 2001 David Howells (dhowells@redhat.com).
6 * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
7 * - Derived also from comments by Linus
8 */
9#include <linux/rwsem.h>
10#include <linux/sched/signal.h>
11#include <linux/sched/debug.h>
12#include <linux/export.h>
13
14enum rwsem_waiter_type {
15 RWSEM_WAITING_FOR_WRITE,
16 RWSEM_WAITING_FOR_READ
17};
18
19struct rwsem_waiter {
20 struct list_head list;
21 struct task_struct *task;
22 enum rwsem_waiter_type type;
23};
24
25int rwsem_is_locked(struct rw_semaphore *sem)
26{
27 int ret = 1;
28 unsigned long flags;
29
30 if (raw_spin_trylock_irqsave(&sem->wait_lock, flags)) {
31 ret = (sem->count != 0);
32 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
33 }
34 return ret;
35}
36EXPORT_SYMBOL(rwsem_is_locked);
37
38/*
39 * initialise the semaphore
40 */
41void __init_rwsem(struct rw_semaphore *sem, const char *name,
42 struct lock_class_key *key)
43{
44#ifdef CONFIG_DEBUG_LOCK_ALLOC
45 /*
46 * Make sure we are not reinitializing a held semaphore:
47 */
48 debug_check_no_locks_freed((void *)sem, sizeof(*sem));
49 lockdep_init_map(&sem->dep_map, name, key, 0);
50#endif
51 sem->count = 0;
52 raw_spin_lock_init(&sem->wait_lock);
53 INIT_LIST_HEAD(&sem->wait_list);
54}
55EXPORT_SYMBOL(__init_rwsem);
56
57/*
58 * handle the lock release when processes blocked on it that can now run
59 * - if we come here, then:
60 * - the 'active count' _reached_ zero
61 * - the 'waiting count' is non-zero
62 * - the spinlock must be held by the caller
63 * - woken process blocks are discarded from the list after having task zeroed
64 * - writers are only woken if wakewrite is non-zero
65 */
66static inline struct rw_semaphore *
67__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
68{
69 struct rwsem_waiter *waiter;
70 struct task_struct *tsk;
71 int woken;
72
73 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
74
75 if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
76 if (wakewrite)
77 /* Wake up a writer. Note that we do not grant it the
78 * lock - it will have to acquire it when it runs. */
79 wake_up_process(waiter->task);
80 goto out;
81 }
82
83 /* grant an infinite number of read locks to the front of the queue */
84 woken = 0;
85 do {
86 struct list_head *next = waiter->list.next;
87
88 list_del(&waiter->list);
89 tsk = waiter->task;
90 /*
91 * Make sure we do not wakeup the next reader before
92 * setting the nil condition to grant the next reader;
93 * otherwise we could miss the wakeup on the other
94 * side and end up sleeping again. See the pairing
95 * in rwsem_down_read_failed().
96 */
97 smp_mb();
98 waiter->task = NULL;
99 wake_up_process(tsk);
100 put_task_struct(tsk);
101 woken++;
102 if (next == &sem->wait_list)
103 break;
104 waiter = list_entry(next, struct rwsem_waiter, list);
105 } while (waiter->type != RWSEM_WAITING_FOR_WRITE);
106
107 sem->count += woken;
108
109 out:
110 return sem;
111}
112
113/*
114 * wake a single writer
115 */
116static inline struct rw_semaphore *
117__rwsem_wake_one_writer(struct rw_semaphore *sem)
118{
119 struct rwsem_waiter *waiter;
120
121 waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
122 wake_up_process(waiter->task);
123
124 return sem;
125}
126
127/*
128 * get a read lock on the semaphore
129 */
130int __sched __down_read_common(struct rw_semaphore *sem, int state)
131{
132 struct rwsem_waiter waiter;
133 unsigned long flags;
134
135 raw_spin_lock_irqsave(&sem->wait_lock, flags);
136
137 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
138 /* granted */
139 sem->count++;
140 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
141 goto out;
142 }
143
144 /* set up my own style of waitqueue */
145 waiter.task = current;
146 waiter.type = RWSEM_WAITING_FOR_READ;
147 get_task_struct(current);
148
149 list_add_tail(&waiter.list, &sem->wait_list);
150
151 /* wait to be given the lock */
152 for (;;) {
153 if (!waiter.task)
154 break;
155 if (signal_pending_state(state, current))
156 goto out_nolock;
157 set_current_state(state);
158 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
159 schedule();
160 raw_spin_lock_irqsave(&sem->wait_lock, flags);
161 }
162
163 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
164 out:
165 return 0;
166
167out_nolock:
168 /*
169 * We didn't take the lock, so that there is a writer, which
170 * is owner or the first waiter of the sem. If it's a waiter,
171 * it will be woken by current owner. Not need to wake anybody.
172 */
173 list_del(&waiter.list);
174 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
175 return -EINTR;
176}
177
178void __sched __down_read(struct rw_semaphore *sem)
179{
180 __down_read_common(sem, TASK_UNINTERRUPTIBLE);
181}
182
183int __sched __down_read_killable(struct rw_semaphore *sem)
184{
185 return __down_read_common(sem, TASK_KILLABLE);
186}
187
188/*
189 * trylock for reading -- returns 1 if successful, 0 if contention
190 */
191int __down_read_trylock(struct rw_semaphore *sem)
192{
193 unsigned long flags;
194 int ret = 0;
195
196
197 raw_spin_lock_irqsave(&sem->wait_lock, flags);
198
199 if (sem->count >= 0 && list_empty(&sem->wait_list)) {
200 /* granted */
201 sem->count++;
202 ret = 1;
203 }
204
205 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
206
207 return ret;
208}
209
210/*
211 * get a write lock on the semaphore
212 */
213int __sched __down_write_common(struct rw_semaphore *sem, int state)
214{
215 struct rwsem_waiter waiter;
216 unsigned long flags;
217 int ret = 0;
218
219 raw_spin_lock_irqsave(&sem->wait_lock, flags);
220
221 /* set up my own style of waitqueue */
222 waiter.task = current;
223 waiter.type = RWSEM_WAITING_FOR_WRITE;
224 list_add_tail(&waiter.list, &sem->wait_list);
225
226 /* wait for someone to release the lock */
227 for (;;) {
228 /*
229 * That is the key to support write lock stealing: allows the
230 * task already on CPU to get the lock soon rather than put
231 * itself into sleep and waiting for system woke it or someone
232 * else in the head of the wait list up.
233 */
234 if (sem->count == 0)
235 break;
236 if (signal_pending_state(state, current))
237 goto out_nolock;
238
239 set_current_state(state);
240 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
241 schedule();
242 raw_spin_lock_irqsave(&sem->wait_lock, flags);
243 }
244 /* got the lock */
245 sem->count = -1;
246 list_del(&waiter.list);
247
248 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
249
250 return ret;
251
252out_nolock:
253 list_del(&waiter.list);
254 if (!list_empty(&sem->wait_list) && sem->count >= 0)
255 __rwsem_do_wake(sem, 0);
256 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
257
258 return -EINTR;
259}
260
261void __sched __down_write(struct rw_semaphore *sem)
262{
263 __down_write_common(sem, TASK_UNINTERRUPTIBLE);
264}
265
266int __sched __down_write_killable(struct rw_semaphore *sem)
267{
268 return __down_write_common(sem, TASK_KILLABLE);
269}
270
271/*
272 * trylock for writing -- returns 1 if successful, 0 if contention
273 */
274int __down_write_trylock(struct rw_semaphore *sem)
275{
276 unsigned long flags;
277 int ret = 0;
278
279 raw_spin_lock_irqsave(&sem->wait_lock, flags);
280
281 if (sem->count == 0) {
282 /* got the lock */
283 sem->count = -1;
284 ret = 1;
285 }
286
287 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
288
289 return ret;
290}
291
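Write lock stealing in this implementation works because __up_write() never hands the lock to a sleeper directly: every writer, whether it has just arrived or has just been woken, re-checks count == 0 under wait_lock and claims the lock by setting count to -1. A task already running on a CPU can therefore win against queued sleepers. A minimal userspace sketch of that rule (hypothetical names, not kernel code):

/*
 * Userspace sketch of write-lock stealing in the code above: a writer
 * takes the lock by observing count == 0 under wait_lock and then
 * setting count = -1.  Names are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_rwsem {
	long count;		/* 0: free, -1: write-locked, >0: readers */
};

/* What both a freshly arriving writer and a woken waiter do. */
static bool toy_write_steal(struct toy_rwsem *sem)
{
	if (sem->count != 0)
		return false;	/* still held; go (back) to sleep */
	sem->count = -1;	/* got the lock */
	return true;
}

int main(void)
{
	struct toy_rwsem sem = { .count = 0 };

	/* A task already on a CPU wins the race before sleepers run. */
	printf("fresh writer steals:  %d\n", toy_write_steal(&sem));	/* 1 */
	printf("woken waiter retries: %d\n", toy_write_steal(&sem));	/* 0 */
	return 0;
}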
292/*
293 * release a read lock on the semaphore
294 */
295void __up_read(struct rw_semaphore *sem)
296{
297 unsigned long flags;
298
299 raw_spin_lock_irqsave(&sem->wait_lock, flags);
300
301 if (--sem->count == 0 && !list_empty(&sem->wait_list))
302 sem = __rwsem_wake_one_writer(sem);
303
304 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
305}
306
307/*
308 * release a write lock on the semaphore
309 */
310void __up_write(struct rw_semaphore *sem)
311{
312 unsigned long flags;
313
314 raw_spin_lock_irqsave(&sem->wait_lock, flags);
315
316 sem->count = 0;
317 if (!list_empty(&sem->wait_list))
318 sem = __rwsem_do_wake(sem, 1);
319
320 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
321}
322
323/*
324 * downgrade a write lock into a read lock
325 * - just wake up any readers at the front of the queue
326 */
327void __downgrade_write(struct rw_semaphore *sem)
328{
329 unsigned long flags;
330
331 raw_spin_lock_irqsave(&sem->wait_lock, flags);
332
333 sem->count = 1;
334 if (!list_empty(&sem->wait_list))
335 sem = __rwsem_do_wake(sem, 0);
336
337 raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
338}
339
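The release side of this implementation follows a simple wake policy: __up_read() wakes a single waiter only when the last reader leaves, while __up_write() and __downgrade_write() call __rwsem_do_wake(), which either wakes the writer at the head of the queue or grants the lock to the leading run of readers, bumping sem->count by the number woken (as seen at the top of this file). A compact userspace model of that decision, using an array in place of the wait list (all names made up):

/*
 * Sketch of the wake policy used by __rwsem_do_wake() above, modelled
 * on a plain array instead of the kernel's wait list.  Illustrative.
 */
#include <stdio.h>

enum toy_wait { TOY_WAIT_READ, TOY_WAIT_WRITE };

struct toy_rwsem {
	long		count;
	enum toy_wait	queue[8];	/* head of the wait list first */
	int		nwaiters;
};

/* Returns how many waiters would be woken on release (wakewrite case). */
static int toy_do_wake(struct toy_rwsem *sem)
{
	int woken = 0;

	if (sem->nwaiters == 0)
		return 0;
	if (sem->queue[0] == TOY_WAIT_WRITE)
		return 1;			/* wake one writer */
	while (woken < sem->nwaiters && sem->queue[woken] == TOY_WAIT_READ)
		woken++;			/* grant the leading readers */
	sem->count += woken;
	return woken;
}

int main(void)
{
	struct toy_rwsem sem = {
		.count	  = 0,
		.queue	  = { TOY_WAIT_READ, TOY_WAIT_READ, TOY_WAIT_WRITE },
		.nwaiters = 3,
	};

	printf("woken on up_write: %d\n", toy_do_wake(&sem));	/* 2 readers */
	printf("reader count now:  %ld\n", sem.count);		/* 2 */
	return 0;
}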
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index fbe96341beee..6b3ee9948bf1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -147,6 +147,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
147 * will notice the queued writer. 147 * will notice the queued writer.
148 */ 148 */
149 wake_q_add(wake_q, waiter->task); 149 wake_q_add(wake_q, waiter->task);
150 lockevent_inc(rwsem_wake_writer);
150 } 151 }
151 152
152 return; 153 return;
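The lockevent_inc()/lockevent_cond_inc() calls added in this and the following hunks come from the new lock_events infrastructure introduced elsewhere in this series (kernel/locking/lock_events.[ch] in the diffstat). Roughly, when CONFIG_LOCK_EVENT_COUNTS is enabled they bump per-CPU event counters whose totals are reported through debugfs; with it disabled they compile away. Ignoring the per-CPU machinery, the counting amounts to something like the following userspace approximation (event names are taken from this diff, everything else is simplified and made up):

/*
 * Rough userspace approximation of the lock event counters used above.
 * The real code keeps one counter per CPU per event and sums them when
 * read; this model collapses that to a single global array.
 */
#include <stdio.h>

enum toy_lock_event {
	toy_rwsem_wake_writer,
	toy_rwsem_wake_reader,
	toy_rwsem_sleep_reader,
	toy_lockevent_num,
};

static unsigned long toy_lockevents[toy_lockevent_num];

#define toy_lockevent_inc(ev)	(toy_lockevents[toy_##ev]++)

/* Increments by one when the condition is non-zero, as in the kernel. */
#define toy_lockevent_cond_inc(ev, c)		\
	do {					\
		if (c)				\
			toy_lockevent_inc(ev);	\
	} while (0)

int main(void)
{
	toy_lockevent_inc(rwsem_wake_writer);
	toy_lockevent_cond_inc(rwsem_wake_reader, 3 /* woken > 0 */);

	printf("rwsem_wake_writer = %lu\n", toy_lockevents[toy_rwsem_wake_writer]);
	printf("rwsem_wake_reader = %lu\n", toy_lockevents[toy_rwsem_wake_reader]);
	return 0;
}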
@@ -176,9 +177,8 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
176 goto try_reader_grant; 177 goto try_reader_grant;
177 } 178 }
178 /* 179 /*
179 * It is not really necessary to set it to reader-owned here, 180 * Set it to reader-owned to give spinners an early
180 * but it gives the spinners an early indication that the 181 * indication that readers now have the lock.
181 * readers now have the lock.
182 */ 182 */
183 __rwsem_set_reader_owned(sem, waiter->task); 183 __rwsem_set_reader_owned(sem, waiter->task);
184 } 184 }
@@ -215,6 +215,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
215 } 215 }
216 216
217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; 217 adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
218 lockevent_cond_inc(rwsem_wake_reader, woken);
218 if (list_empty(&sem->wait_list)) { 219 if (list_empty(&sem->wait_list)) {
219 /* hit end of list above */ 220 /* hit end of list above */
220 adjustment -= RWSEM_WAITING_BIAS; 221 adjustment -= RWSEM_WAITING_BIAS;
@@ -225,92 +226,6 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
225} 226}
226 227
227/* 228/*
228 * Wait for the read lock to be granted
229 */
230static inline struct rw_semaphore __sched *
231__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
232{
233 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
234 struct rwsem_waiter waiter;
235 DEFINE_WAKE_Q(wake_q);
236
237 waiter.task = current;
238 waiter.type = RWSEM_WAITING_FOR_READ;
239
240 raw_spin_lock_irq(&sem->wait_lock);
241 if (list_empty(&sem->wait_list)) {
242 /*
243 * In case the wait queue is empty and the lock isn't owned
244 * by a writer, this reader can exit the slowpath and return
245 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
246 * been set in the count.
247 */
248 if (atomic_long_read(&sem->count) >= 0) {
249 raw_spin_unlock_irq(&sem->wait_lock);
250 return sem;
251 }
252 adjustment += RWSEM_WAITING_BIAS;
253 }
254 list_add_tail(&waiter.list, &sem->wait_list);
255
256 /* we're now waiting on the lock, but no longer actively locking */
257 count = atomic_long_add_return(adjustment, &sem->count);
258
259 /*
260 * If there are no active locks, wake the front queued process(es).
261 *
262 * If there are no writers and we are first in the queue,
263 * wake our own waiter to join the existing active readers !
264 */
265 if (count == RWSEM_WAITING_BIAS ||
266 (count > RWSEM_WAITING_BIAS &&
267 adjustment != -RWSEM_ACTIVE_READ_BIAS))
268 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
269
270 raw_spin_unlock_irq(&sem->wait_lock);
271 wake_up_q(&wake_q);
272
273 /* wait to be given the lock */
274 while (true) {
275 set_current_state(state);
276 if (!waiter.task)
277 break;
278 if (signal_pending_state(state, current)) {
279 raw_spin_lock_irq(&sem->wait_lock);
280 if (waiter.task)
281 goto out_nolock;
282 raw_spin_unlock_irq(&sem->wait_lock);
283 break;
284 }
285 schedule();
286 }
287
288 __set_current_state(TASK_RUNNING);
289 return sem;
290out_nolock:
291 list_del(&waiter.list);
292 if (list_empty(&sem->wait_list))
293 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
294 raw_spin_unlock_irq(&sem->wait_lock);
295 __set_current_state(TASK_RUNNING);
296 return ERR_PTR(-EINTR);
297}
298
299__visible struct rw_semaphore * __sched
300rwsem_down_read_failed(struct rw_semaphore *sem)
301{
302 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
303}
304EXPORT_SYMBOL(rwsem_down_read_failed);
305
306__visible struct rw_semaphore * __sched
307rwsem_down_read_failed_killable(struct rw_semaphore *sem)
308{
309 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
310}
311EXPORT_SYMBOL(rwsem_down_read_failed_killable);
312
313/*
314 * This function must be called with the sem->wait_lock held to prevent 229 * This function must be called with the sem->wait_lock held to prevent
315 * race conditions between checking the rwsem wait list and setting the 230 * race conditions between checking the rwsem wait list and setting the
316 * sem->count accordingly. 231 * sem->count accordingly.
@@ -346,21 +261,17 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
346 */ 261 */
347static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) 262static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
348{ 263{
349 long old, count = atomic_long_read(&sem->count); 264 long count = atomic_long_read(&sem->count);
350
351 while (true) {
352 if (!(count == 0 || count == RWSEM_WAITING_BIAS))
353 return false;
354 265
355 old = atomic_long_cmpxchg_acquire(&sem->count, count, 266 while (!count || count == RWSEM_WAITING_BIAS) {
356 count + RWSEM_ACTIVE_WRITE_BIAS); 267 if (atomic_long_try_cmpxchg_acquire(&sem->count, &count,
357 if (old == count) { 268 count + RWSEM_ACTIVE_WRITE_BIAS)) {
358 rwsem_set_owner(sem); 269 rwsem_set_owner(sem);
270 lockevent_inc(rwsem_opt_wlock);
359 return true; 271 return true;
360 } 272 }
361
362 count = old;
363 } 273 }
274 return false;
364} 275}
365 276
366static inline bool owner_on_cpu(struct task_struct *owner) 277static inline bool owner_on_cpu(struct task_struct *owner)
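The rewritten rwsem_try_write_lock_unqueued() above replaces an open-coded cmpxchg retry loop with atomic_long_try_cmpxchg_acquire(), whose failure path writes the value it observed back into count, so the loop condition can be re-tested directly without a separate reload and compare. C11's atomic_compare_exchange_strong_explicit() has the same contract, so the pattern can be illustrated stand-alone as below (constants borrowed from this diff, names otherwise invented):

/*
 * Minimal C11 illustration of the try_cmpxchg pattern adopted above:
 * on failure, 'count' is refreshed with the current value of the
 * atomic, so the while condition re-tests it without an extra load.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_WAITING_BIAS	(-0xffffffffL - 1)
#define TOY_WRITE_BIAS		(TOY_WAITING_BIAS + 1)

static bool toy_try_write_lock_unqueued(atomic_long *countp)
{
	long count = atomic_load(countp);

	while (!count || count == TOY_WAITING_BIAS) {
		if (atomic_compare_exchange_strong_explicit(countp, &count,
				count + TOY_WRITE_BIAS,
				memory_order_acquire, memory_order_relaxed))
			return true;	/* stole the write lock */
	}
	return false;			/* an active locker appeared */
}

int main(void)
{
	atomic_long count = 0;		/* unlocked, no waiters */

	printf("first attempt:  %d\n", toy_try_write_lock_unqueued(&count));	/* 1 */
	printf("second attempt: %d\n", toy_try_write_lock_unqueued(&count));	/* 0 */
	return 0;
}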
@@ -481,6 +392,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
481 osq_unlock(&sem->osq); 392 osq_unlock(&sem->osq);
482done: 393done:
483 preempt_enable(); 394 preempt_enable();
395 lockevent_cond_inc(rwsem_opt_fail, !taken);
484 return taken; 396 return taken;
485} 397}
486 398
@@ -505,6 +417,97 @@ static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
505#endif 417#endif
506 418
507/* 419/*
420 * Wait for the read lock to be granted
421 */
422static inline struct rw_semaphore __sched *
423__rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
424{
425 long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
426 struct rwsem_waiter waiter;
427 DEFINE_WAKE_Q(wake_q);
428
429 waiter.task = current;
430 waiter.type = RWSEM_WAITING_FOR_READ;
431
432 raw_spin_lock_irq(&sem->wait_lock);
433 if (list_empty(&sem->wait_list)) {
434 /*
435 * In case the wait queue is empty and the lock isn't owned
436 * by a writer, this reader can exit the slowpath and return
437 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
438 * been set in the count.
439 */
440 if (atomic_long_read(&sem->count) >= 0) {
441 raw_spin_unlock_irq(&sem->wait_lock);
442 rwsem_set_reader_owned(sem);
443 lockevent_inc(rwsem_rlock_fast);
444 return sem;
445 }
446 adjustment += RWSEM_WAITING_BIAS;
447 }
448 list_add_tail(&waiter.list, &sem->wait_list);
449
450 /* we're now waiting on the lock, but no longer actively locking */
451 count = atomic_long_add_return(adjustment, &sem->count);
452
453 /*
454 * If there are no active locks, wake the front queued process(es).
455 *
456 * If there are no writers and we are first in the queue,
457 * wake our own waiter to join the existing active readers !
458 */
459 if (count == RWSEM_WAITING_BIAS ||
460 (count > RWSEM_WAITING_BIAS &&
461 adjustment != -RWSEM_ACTIVE_READ_BIAS))
462 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
463
464 raw_spin_unlock_irq(&sem->wait_lock);
465 wake_up_q(&wake_q);
466
467 /* wait to be given the lock */
468 while (true) {
469 set_current_state(state);
470 if (!waiter.task)
471 break;
472 if (signal_pending_state(state, current)) {
473 raw_spin_lock_irq(&sem->wait_lock);
474 if (waiter.task)
475 goto out_nolock;
476 raw_spin_unlock_irq(&sem->wait_lock);
477 break;
478 }
479 schedule();
480 lockevent_inc(rwsem_sleep_reader);
481 }
482
483 __set_current_state(TASK_RUNNING);
484 lockevent_inc(rwsem_rlock);
485 return sem;
486out_nolock:
487 list_del(&waiter.list);
488 if (list_empty(&sem->wait_list))
489 atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
490 raw_spin_unlock_irq(&sem->wait_lock);
491 __set_current_state(TASK_RUNNING);
492 lockevent_inc(rwsem_rlock_fail);
493 return ERR_PTR(-EINTR);
494}
495
496__visible struct rw_semaphore * __sched
497rwsem_down_read_failed(struct rw_semaphore *sem)
498{
499 return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE);
500}
501EXPORT_SYMBOL(rwsem_down_read_failed);
502
503__visible struct rw_semaphore * __sched
504rwsem_down_read_failed_killable(struct rw_semaphore *sem)
505{
506 return __rwsem_down_read_failed_common(sem, TASK_KILLABLE);
507}
508EXPORT_SYMBOL(rwsem_down_read_failed_killable);
509
510/*
508 * Wait until we successfully acquire the write lock 511 * Wait until we successfully acquire the write lock
509 */ 512 */
510static inline struct rw_semaphore * 513static inline struct rw_semaphore *
@@ -580,6 +583,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
580 goto out_nolock; 583 goto out_nolock;
581 584
582 schedule(); 585 schedule();
586 lockevent_inc(rwsem_sleep_writer);
583 set_current_state(state); 587 set_current_state(state);
584 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); 588 } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
585 589
@@ -588,6 +592,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
588 __set_current_state(TASK_RUNNING); 592 __set_current_state(TASK_RUNNING);
589 list_del(&waiter.list); 593 list_del(&waiter.list);
590 raw_spin_unlock_irq(&sem->wait_lock); 594 raw_spin_unlock_irq(&sem->wait_lock);
595 lockevent_inc(rwsem_wlock);
591 596
592 return ret; 597 return ret;
593 598
@@ -601,6 +606,7 @@ out_nolock:
601 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); 606 __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
602 raw_spin_unlock_irq(&sem->wait_lock); 607 raw_spin_unlock_irq(&sem->wait_lock);
603 wake_up_q(&wake_q); 608 wake_up_q(&wake_q);
609 lockevent_inc(rwsem_wlock_fail);
604 610
605 return ERR_PTR(-EINTR); 611 return ERR_PTR(-EINTR);
606} 612}
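The bias arithmetic in the reader slowpath above is easier to follow with concrete values. The reader's fast path has already added RWSEM_ACTIVE_READ_BIAS before it falls in here; the slowpath then undoes that bias and, if it is the first waiter, adds RWSEM_WAITING_BIAS. Once the writer releases, the count collapses to exactly RWSEM_WAITING_BIAS, i.e. "waiters exist but nobody holds the lock", which is when the front of the queue gets woken. The following stand-alone program walks through that sequence using the 64-bit constants this series moves into rwsem.h further down in the diff; it is illustrative only, not kernel code:

/*
 * Worked example of the rwsem-xadd count arithmetic in the reader
 * slowpath above, with the 64-bit bias constants from rwsem.h.
 */
#include <stdio.h>

#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	/* A writer holds the lock; a reader's fast path already added its bias. */
	long count = RWSEM_ACTIVE_WRITE_BIAS + RWSEM_ACTIVE_READ_BIAS;

	/* First waiter: undo the read bias, add the waiting bias. */
	long adjustment = -RWSEM_ACTIVE_READ_BIAS + RWSEM_WAITING_BIAS;

	count += adjustment;
	printf("active lockers while the reader sleeps: %ld\n",
	       count & RWSEM_ACTIVE_MASK);			/* 1 (the writer) */

	/* The writer releases: it subtracts its write bias. */
	count -= RWSEM_ACTIVE_WRITE_BIAS;
	printf("wake the front of the queue? %d\n",
	       count == RWSEM_WAITING_BIAS);			/* 1 */
	return 0;
}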
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e586f0d03ad3..ccbf18f560ff 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -24,7 +24,6 @@ void __sched down_read(struct rw_semaphore *sem)
24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 24 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
25 25
26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 26 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
27 rwsem_set_reader_owned(sem);
28} 27}
29 28
30EXPORT_SYMBOL(down_read); 29EXPORT_SYMBOL(down_read);
@@ -39,7 +38,6 @@ int __sched down_read_killable(struct rw_semaphore *sem)
39 return -EINTR; 38 return -EINTR;
40 } 39 }
41 40
42 rwsem_set_reader_owned(sem);
43 return 0; 41 return 0;
44} 42}
45 43
@@ -52,10 +50,8 @@ int down_read_trylock(struct rw_semaphore *sem)
52{ 50{
53 int ret = __down_read_trylock(sem); 51 int ret = __down_read_trylock(sem);
54 52
55 if (ret == 1) { 53 if (ret == 1)
56 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); 54 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
57 rwsem_set_reader_owned(sem);
58 }
59 return ret; 55 return ret;
60} 56}
61 57
@@ -70,7 +66,6 @@ void __sched down_write(struct rw_semaphore *sem)
70 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 66 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
71 67
72 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 68 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
73 rwsem_set_owner(sem);
74} 69}
75 70
76EXPORT_SYMBOL(down_write); 71EXPORT_SYMBOL(down_write);
@@ -88,7 +83,6 @@ int __sched down_write_killable(struct rw_semaphore *sem)
88 return -EINTR; 83 return -EINTR;
89 } 84 }
90 85
91 rwsem_set_owner(sem);
92 return 0; 86 return 0;
93} 87}
94 88
@@ -101,10 +95,8 @@ int down_write_trylock(struct rw_semaphore *sem)
101{ 95{
102 int ret = __down_write_trylock(sem); 96 int ret = __down_write_trylock(sem);
103 97
104 if (ret == 1) { 98 if (ret == 1)
105 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_); 99 rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
106 rwsem_set_owner(sem);
107 }
108 100
109 return ret; 101 return ret;
110} 102}
@@ -117,9 +109,7 @@ EXPORT_SYMBOL(down_write_trylock);
117void up_read(struct rw_semaphore *sem) 109void up_read(struct rw_semaphore *sem)
118{ 110{
119 rwsem_release(&sem->dep_map, 1, _RET_IP_); 111 rwsem_release(&sem->dep_map, 1, _RET_IP_);
120 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
121 112
122 rwsem_clear_reader_owned(sem);
123 __up_read(sem); 113 __up_read(sem);
124} 114}
125 115
@@ -131,9 +121,7 @@ EXPORT_SYMBOL(up_read);
131void up_write(struct rw_semaphore *sem) 121void up_write(struct rw_semaphore *sem)
132{ 122{
133 rwsem_release(&sem->dep_map, 1, _RET_IP_); 123 rwsem_release(&sem->dep_map, 1, _RET_IP_);
134 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
135 124
136 rwsem_clear_owner(sem);
137 __up_write(sem); 125 __up_write(sem);
138} 126}
139 127
@@ -145,9 +133,7 @@ EXPORT_SYMBOL(up_write);
145void downgrade_write(struct rw_semaphore *sem) 133void downgrade_write(struct rw_semaphore *sem)
146{ 134{
147 lock_downgrade(&sem->dep_map, _RET_IP_); 135 lock_downgrade(&sem->dep_map, _RET_IP_);
148 DEBUG_RWSEMS_WARN_ON(sem->owner != current);
149 136
150 rwsem_set_reader_owned(sem);
151 __downgrade_write(sem); 137 __downgrade_write(sem);
152} 138}
153 139
@@ -161,7 +147,6 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
161 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); 147 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
162 148
163 LOCK_CONTENDED(sem, __down_read_trylock, __down_read); 149 LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
164 rwsem_set_reader_owned(sem);
165} 150}
166 151
167EXPORT_SYMBOL(down_read_nested); 152EXPORT_SYMBOL(down_read_nested);
@@ -172,7 +157,6 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
172 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); 157 rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
173 158
174 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 159 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
175 rwsem_set_owner(sem);
176} 160}
177 161
178EXPORT_SYMBOL(_down_write_nest_lock); 162EXPORT_SYMBOL(_down_write_nest_lock);
@@ -193,7 +177,6 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
193 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); 177 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
194 178
195 LOCK_CONTENDED(sem, __down_write_trylock, __down_write); 179 LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
196 rwsem_set_owner(sem);
197} 180}
198 181
199EXPORT_SYMBOL(down_write_nested); 182EXPORT_SYMBOL(down_write_nested);
@@ -208,7 +191,6 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass)
208 return -EINTR; 191 return -EINTR;
209 } 192 }
210 193
211 rwsem_set_owner(sem);
212 return 0; 194 return 0;
213} 195}
214 196
@@ -216,7 +198,8 @@ EXPORT_SYMBOL(down_write_killable_nested);
216 198
217void up_read_non_owner(struct rw_semaphore *sem) 199void up_read_non_owner(struct rw_semaphore *sem)
218{ 200{
219 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED)); 201 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
202 sem);
220 __up_read(sem); 203 __up_read(sem);
221} 204}
222 205
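The removals in rwsem.c above are not lost functionality: the owner-tracking calls (rwsem_set_owner(), rwsem_set_reader_owned(), the clearing counterparts and the DEBUG_RWSEMS_WARN_ON() checks) move into the inline __down_*/__up_*/__downgrade_write() primitives shown in the rwsem.h hunk below, so every acquisition and release path records ownership itself rather than relying on the top-level wrappers. A toy userspace model of that relocation (all names invented):

/*
 * Sketch of the owner-tracking relocation in this patch: the high-level
 * API no longer records the owner because the low-level primitive does
 * it on every path.  Illustrative userspace model, not kernel code.
 */
#include <stdio.h>

struct toy_task  { const char *name; };
struct toy_rwsem { long count; struct toy_task *owner; };

static struct toy_task toy_current = { "taskA" };

/* Stands in for __down_write(): takes the lock and sets the owner. */
static void toy_down_write(struct toy_rwsem *sem)
{
	sem->count = -1;
	sem->owner = &toy_current;	/* previously done by the caller */
}

/* Stands in for down_write(): only lockdep-style bookkeeping remains. */
static void toy_api_down_write(struct toy_rwsem *sem)
{
	/* rwsem_acquire(...) would go here */
	toy_down_write(sem);
	/* no separate rwsem_set_owner() call any more */
}

int main(void)
{
	struct toy_rwsem sem = { 0, NULL };

	toy_api_down_write(&sem);
	printf("owner after down_write: %s\n", sem.owner->name);
	return 0;
}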
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index bad2bca0268b..64877f5294e3 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -23,15 +23,44 @@
23 * is involved. Ideally we would like to track all the readers that own 23 * is involved. Ideally we would like to track all the readers that own
24 * a rwsem, but the overhead is simply too big. 24 * a rwsem, but the overhead is simply too big.
25 */ 25 */
26#include "lock_events.h"
27
26#define RWSEM_READER_OWNED (1UL << 0) 28#define RWSEM_READER_OWNED (1UL << 0)
27#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) 29#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
28 30
29#ifdef CONFIG_DEBUG_RWSEMS 31#ifdef CONFIG_DEBUG_RWSEMS
30# define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) 32# define DEBUG_RWSEMS_WARN_ON(c, sem) do { \
33 if (!debug_locks_silent && \
34 WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\
35 #c, atomic_long_read(&(sem)->count), \
36 (long)((sem)->owner), (long)current, \
37 list_empty(&(sem)->wait_list) ? "" : "not ")) \
38 debug_locks_off(); \
39 } while (0)
40#else
41# define DEBUG_RWSEMS_WARN_ON(c, sem)
42#endif
43
44/*
45 * R/W semaphores originally for PPC using the stuff in lib/rwsem.c.
46 * Adapted largely from include/asm-i386/rwsem.h
47 * by Paul Mackerras <paulus@samba.org>.
48 */
49
50/*
51 * the semaphore definition
52 */
53#ifdef CONFIG_64BIT
54# define RWSEM_ACTIVE_MASK 0xffffffffL
31#else 55#else
32# define DEBUG_RWSEMS_WARN_ON(c) 56# define RWSEM_ACTIVE_MASK 0x0000ffffL
33#endif 57#endif
34 58
59#define RWSEM_ACTIVE_BIAS 0x00000001L
60#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
61#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
62#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
63
35#ifdef CONFIG_RWSEM_SPIN_ON_OWNER 64#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
36/* 65/*
37 * All writes to owner are protected by WRITE_ONCE() to make sure that 66 * All writes to owner are protected by WRITE_ONCE() to make sure that
@@ -132,3 +161,144 @@ static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
132{ 161{
133} 162}
134#endif 163#endif
164
165extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
166extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem);
167extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
168extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem);
169extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
170extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
171
172/*
173 * lock for reading
174 */
175static inline void __down_read(struct rw_semaphore *sem)
176{
177 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
178 rwsem_down_read_failed(sem);
179 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
180 RWSEM_READER_OWNED), sem);
181 } else {
182 rwsem_set_reader_owned(sem);
183 }
184}
185
186static inline int __down_read_killable(struct rw_semaphore *sem)
187{
188 if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) {
189 if (IS_ERR(rwsem_down_read_failed_killable(sem)))
190 return -EINTR;
191 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner &
192 RWSEM_READER_OWNED), sem);
193 } else {
194 rwsem_set_reader_owned(sem);
195 }
196 return 0;
197}
198
199static inline int __down_read_trylock(struct rw_semaphore *sem)
200{
201 /*
202 * Optimize for the case when the rwsem is not locked at all.
203 */
204 long tmp = RWSEM_UNLOCKED_VALUE;
205
206 lockevent_inc(rwsem_rtrylock);
207 do {
208 if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
209 tmp + RWSEM_ACTIVE_READ_BIAS)) {
210 rwsem_set_reader_owned(sem);
211 return 1;
212 }
213 } while (tmp >= 0);
214 return 0;
215}
216
217/*
218 * lock for writing
219 */
220static inline void __down_write(struct rw_semaphore *sem)
221{
222 long tmp;
223
224 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
225 &sem->count);
226 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
227 rwsem_down_write_failed(sem);
228 rwsem_set_owner(sem);
229}
230
231static inline int __down_write_killable(struct rw_semaphore *sem)
232{
233 long tmp;
234
235 tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS,
236 &sem->count);
237 if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS))
238 if (IS_ERR(rwsem_down_write_failed_killable(sem)))
239 return -EINTR;
240 rwsem_set_owner(sem);
241 return 0;
242}
243
244static inline int __down_write_trylock(struct rw_semaphore *sem)
245{
246 long tmp;
247
248 lockevent_inc(rwsem_wtrylock);
249 tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE,
250 RWSEM_ACTIVE_WRITE_BIAS);
251 if (tmp == RWSEM_UNLOCKED_VALUE) {
252 rwsem_set_owner(sem);
253 return true;
254 }
255 return false;
256}
257
258/*
259 * unlock after reading
260 */
261static inline void __up_read(struct rw_semaphore *sem)
262{
263 long tmp;
264
265 DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED),
266 sem);
267 rwsem_clear_reader_owned(sem);
268 tmp = atomic_long_dec_return_release(&sem->count);
269 if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0))
270 rwsem_wake(sem);
271}
272
273/*
274 * unlock after writing
275 */
276static inline void __up_write(struct rw_semaphore *sem)
277{
278 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
279 rwsem_clear_owner(sem);
280 if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS,
281 &sem->count) < 0))
282 rwsem_wake(sem);
283}
284
285/*
286 * downgrade write lock to read lock
287 */
288static inline void __downgrade_write(struct rw_semaphore *sem)
289{
290 long tmp;
291
292 /*
293 * When downgrading from exclusive to shared ownership,
294 * anything inside the write-locked region cannot leak
295 * into the read side. In contrast, anything in the
296 * read-locked region is ok to be re-ordered into the
297 * write side. As such, rely on RELEASE semantics.
298 */
299 DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem);
300 tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count);
301 rwsem_set_reader_owned(sem);
302 if (tmp < 0)
303 rwsem_downgrade_wake(sem);
304}
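To close, the __downgrade_write() arithmetic above is worth spelling out with the constants defined earlier in this header: adding -RWSEM_WAITING_BIAS turns the writer's bias into a single reader bias, and the result stays negative only when other tasks have contributed RWSEM_WAITING_BIAS to the count, i.e. when there are queued waiters to hand over to rwsem_downgrade_wake(). A stand-alone sketch of both cases (illustrative only, not kernel code):

/*
 * Worked example of the __downgrade_write() arithmetic above, using
 * the 64-bit bias constants from this header.
 */
#include <stdio.h>

#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

static long toy_downgrade(long count)
{
	/* atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count) */
	return count + -RWSEM_WAITING_BIAS;
}

int main(void)
{
	/* Writer only: downgrading leaves exactly one active reader. */
	long tmp = toy_downgrade(RWSEM_ACTIVE_WRITE_BIAS);
	printf("no waiters:   tmp = %ld, wake? %d\n", tmp, tmp < 0);	/* 1, 0 */

	/* Writer plus queued waiters (they contributed RWSEM_WAITING_BIAS). */
	tmp = toy_downgrade(RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
	printf("with waiters: active readers = %ld, wake? %d\n",
	       tmp & RWSEM_ACTIVE_MASK, tmp < 0);			/* 1, 1 */
	return 0;
}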