Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/lockdep.c            | 116
-rw-r--r--  kernel/locking/lockdep_internals.h  |  27
-rw-r--r--  kernel/locking/lockdep_proc.c       |   2
-rw-r--r--  kernel/locking/qspinlock.c          | 143
-rw-r--r--  kernel/locking/qspinlock_paravirt.h |   4
-rw-r--r--  kernel/locking/qspinlock_stat.h     |   6
-rw-r--r--  kernel/locking/rtmutex.c            |   4
-rw-r--r--  kernel/locking/rwsem-xadd.c         |  15
-rw-r--r--  kernel/locking/rwsem.c              |   7
-rw-r--r--  kernel/locking/rwsem.h              |  95
10 files changed, 252 insertions, 167 deletions
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index dd13f865ad40..1efada2dd9dd 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -138,7 +138,7 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
  * get freed - this significantly simplifies the debugging code.
  */
 unsigned long nr_lock_classes;
-static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+struct lock_class lock_classes[MAX_LOCKDEP_KEYS];

 static inline struct lock_class *hlock_class(struct held_lock *hlock)
 {
@@ -1391,7 +1391,9 @@ static void print_lock_class_header(struct lock_class *class, int depth)

         printk("%*s->", depth, "");
         print_lock_name(class);
-        printk(KERN_CONT " ops: %lu", class->ops);
+#ifdef CONFIG_DEBUG_LOCKDEP
+        printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
+#endif
         printk(KERN_CONT " {\n");

         for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
@@ -2148,76 +2150,6 @@ static int check_no_collision(struct task_struct *curr,
 }

 /*
- * This is for building a chain between just two different classes,
- * instead of adding a new hlock upon current, which is done by
- * add_chain_cache().
- *
- * This can be called in any context with two classes, while
- * add_chain_cache() must be done within the lock owener's context
- * since it uses hlock which might be racy in another context.
- */
-static inline int add_chain_cache_classes(unsigned int prev,
-                                          unsigned int next,
-                                          unsigned int irq_context,
-                                          u64 chain_key)
-{
-        struct hlist_head *hash_head = chainhashentry(chain_key);
-        struct lock_chain *chain;
-
-        /*
-         * Allocate a new chain entry from the static array, and add
-         * it to the hash:
-         */
-
-        /*
-         * We might need to take the graph lock, ensure we've got IRQs
-         * disabled to make this an IRQ-safe lock.. for recursion reasons
-         * lockdep won't complain about its own locking errors.
-         */
-        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
-                return 0;
-
-        if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
-                if (!debug_locks_off_graph_unlock())
-                        return 0;
-
-                print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
-                dump_stack();
-                return 0;
-        }
-
-        chain = lock_chains + nr_lock_chains++;
-        chain->chain_key = chain_key;
-        chain->irq_context = irq_context;
-        chain->depth = 2;
-        if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
-                chain->base = nr_chain_hlocks;
-                nr_chain_hlocks += chain->depth;
-                chain_hlocks[chain->base] = prev - 1;
-                chain_hlocks[chain->base + 1] = next -1;
-        }
-#ifdef CONFIG_DEBUG_LOCKDEP
-        /*
-         * Important for check_no_collision().
-         */
-        else {
-                if (!debug_locks_off_graph_unlock())
-                        return 0;
-
-                print_lockdep_off("BUG: MAX_LOCKDEP_CHAIN_HLOCKS too low!");
-                dump_stack();
-                return 0;
-        }
-#endif
-
-        hlist_add_head_rcu(&chain->entry, hash_head);
-        debug_atomic_inc(chain_lookup_misses);
-        inc_chains();
-
-        return 1;
-}
-
-/*
  * Adds a dependency chain into chain hashtable. And must be called with
  * graph_lock held.
  *
@@ -3262,6 +3194,10 @@ static int __lock_is_held(const struct lockdep_map *lock, int read);
 /*
  * This gets called for every mutex_lock*()/spin_lock*() operation.
  * We maintain the dependency maps and validate the locking attempt:
+ *
+ * The callers must make sure that IRQs are disabled before calling it,
+ * otherwise we could get an interrupt which would want to take locks,
+ * which would end up in lockdep again.
  */
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                           int trylock, int read, int check, int hardirqs_off,
@@ -3279,14 +3215,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
         if (unlikely(!debug_locks))
                 return 0;

-        /*
-         * Lockdep should run with IRQs disabled, otherwise we could
-         * get an interrupt which would want to take locks, which would
-         * end up in lockdep and have you got a head-ache already?
-         */
-        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
-                return 0;
-
         if (!prove_locking || lock->key == &__lockdep_no_validate__)
                 check = 0;

@@ -3300,7 +3228,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                 if (!class)
                         return 0;
         }
-        atomic_inc((atomic_t *)&class->ops);
+
+        debug_class_ops_inc(class);
+
         if (very_verbose(class)) {
                 printk("\nacquire class [%px] %s", class->key, class->name);
                 if (class->name_version > 1)
@@ -3543,6 +3473,9 @@ static int reacquire_held_locks(struct task_struct *curr, unsigned int depth,
 {
         struct held_lock *hlock;

+        if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+                return 0;
+
         for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) {
                 if (!__lock_acquire(hlock->instance,
                                 hlock_class(hlock)->subclass,
@@ -3696,6 +3629,13 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
         curr->lockdep_depth = i;
         curr->curr_chain_key = hlock->prev_chain_key;

+        /*
+         * The most likely case is when the unlock is on the innermost
+         * lock. In this case, we are done!
+         */
+        if (i == depth-1)
+                return 1;
+
         if (reacquire_held_locks(curr, depth, i + 1))
                 return 0;

@@ -3703,10 +3643,14 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
          * We had N bottles of beer on the wall, we drank one, but now
          * there's not N-1 bottles of beer left on the wall...
          */
-        if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
-                return 0;
+        DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1);

-        return 1;
+        /*
+         * Since reacquire_held_locks() would have called check_chain_key()
+         * indirectly via __lock_acquire(), we don't need to do it again
+         * on return.
+         */
+        return 0;
 }

 static int __lock_is_held(const struct lockdep_map *lock, int read)
@@ -4122,7 +4066,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
         unsigned long flags;

-        if (unlikely(!lock_stat))
+        if (unlikely(!lock_stat || !debug_locks))
                 return;

         if (unlikely(current->lockdep_recursion))
@@ -4142,7 +4086,7 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
         unsigned long flags;

-        if (unlikely(!lock_stat))
+        if (unlikely(!lock_stat || !debug_locks))
                 return;

         if (unlikely(current->lockdep_recursion))
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index d459d624ba2a..88c847a41c8a 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -152,9 +152,15 @@ struct lockdep_stats {
         int     nr_find_usage_forwards_recursions;
         int     nr_find_usage_backwards_checks;
         int     nr_find_usage_backwards_recursions;
+
+        /*
+         * Per lock class locking operation stat counts
+         */
+        unsigned long lock_class_ops[MAX_LOCKDEP_KEYS];
 };

 DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];

 #define __debug_atomic_inc(ptr)                                 \
         this_cpu_inc(lockdep_stats.ptr);
@@ -179,9 +185,30 @@ DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
         }                                                       \
         __total;                                                \
 })
+
+static inline void debug_class_ops_inc(struct lock_class *class)
+{
+        int idx;
+
+        idx = class - lock_classes;
+        __debug_atomic_inc(lock_class_ops[idx]);
+}
+
+static inline unsigned long debug_class_ops_read(struct lock_class *class)
+{
+        int idx, cpu;
+        unsigned long ops = 0;
+
+        idx = class - lock_classes;
+        for_each_possible_cpu(cpu)
+                ops += per_cpu(lockdep_stats.lock_class_ops[idx], cpu);
+        return ops;
+}
+
 #else
 # define __debug_atomic_inc(ptr)        do { } while (0)
 # define debug_atomic_inc(ptr)          do { } while (0)
 # define debug_atomic_dec(ptr)          do { } while (0)
 # define debug_atomic_read(ptr)         0
+# define debug_class_ops_inc(ptr)       do { } while (0)
 #endif
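
The debug_class_ops_inc()/debug_class_ops_read() pair added above follows the usual per-CPU statistics pattern: each CPU bumps its own slot without atomics, and the reader sums a racy snapshot over all possible CPUs, so the returned total is approximate. As a rough illustration only, a stand-alone user-space model of that pattern (not kernel code; NR_CPUS, NR_KEYS and the plain arrays stand in for the per-CPU machinery and MAX_LOCKDEP_KEYS):

    #include <stdio.h>

    #define NR_CPUS  4   /* stand-in for the set of possible CPUs */
    #define NR_KEYS  8   /* stand-in for MAX_LOCKDEP_KEYS */

    /* One counter array per CPU, analogous to the per-CPU lockdep_stats. */
    static unsigned long lock_class_ops[NR_CPUS][NR_KEYS];

    /* Increment only the calling CPU's slot; no atomics, no shared cacheline. */
    static void class_ops_inc(int cpu, int class_idx)
    {
            lock_class_ops[cpu][class_idx]++;
    }

    /* Reader sums a snapshot of every CPU's slot; the total is approximate. */
    static unsigned long class_ops_read(int class_idx)
    {
            unsigned long ops = 0;

            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    ops += lock_class_ops[cpu][class_idx];
            return ops;
    }

    int main(void)
    {
            class_ops_inc(0, 3);
            class_ops_inc(2, 3);
            printf("class 3 ops: %lu\n", class_ops_read(3));  /* prints 2 */
            return 0;
    }

The write side stays contention-free, which is why the patch can afford to count every lock operation; the cost is only paid when /proc/lockdep is read.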
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 3dd980dfba2d..3d31f9b0059e 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -68,7 +68,7 @@ static int l_show(struct seq_file *m, void *v)

         seq_printf(m, "%p", class->key);
 #ifdef CONFIG_DEBUG_LOCKDEP
-        seq_printf(m, " OPS:%8ld", class->ops);
+        seq_printf(m, " OPS:%8ld", debug_class_ops_read(class));
 #endif
 #ifdef CONFIG_PROVE_LOCKING
         seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index bfaeb05123ff..8a8c3c208c5e 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -74,12 +74,24 @@
  */

 #include "mcs_spinlock.h"
+#define MAX_NODES       4

+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+        struct mcs_spinlock mcs;
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES       8
-#else
-#define MAX_NODES       4
+        long reserved[2];
 #endif
+};

 /*
  * The pending bit spinning loop count.
@@ -101,7 +113,7 @@
  *
  * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);

 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -126,7 +138,13 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
         int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
         int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;

-        return per_cpu_ptr(&mcs_nodes[idx], cpu);
+        return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+        return &((struct qnode *)base + idx)->mcs;
 }

 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
@@ -232,6 +250,20 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 #endif /* _Q_PENDING_BITS == 8 */

 /**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
+{
+        return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
+/**
  * set_locked - Set the lock bit and own the lock
  * @lock: Pointer to queued spinlock structure
  *
@@ -326,43 +358,48 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
         /*
          * trylock || pending
          *
-         * 0,0,0 -> 0,0,1 ; trylock
-         * 0,0,1 -> 0,1,1 ; pending
+         * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
          */
-        val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
-        if (!(val & ~_Q_LOCKED_MASK)) {
-                /*
-                 * We're pending, wait for the owner to go away.
-                 *
-                 * *,1,1 -> *,1,0
-                 *
-                 * this wait loop must be a load-acquire such that we match the
-                 * store-release that clears the locked bit and create lock
-                 * sequentiality; this is because not all
-                 * clear_pending_set_locked() implementations imply full
-                 * barriers.
-                 */
-                if (val & _Q_LOCKED_MASK) {
-                        atomic_cond_read_acquire(&lock->val,
-                                                 !(VAL & _Q_LOCKED_MASK));
-                }
+        val = queued_fetch_set_pending_acquire(lock);

-                /*
-                 * take ownership and clear the pending bit.
-                 *
-                 * *,1,0 -> *,0,1
-                 */
-                clear_pending_set_locked(lock);
-                qstat_inc(qstat_lock_pending, true);
-                return;
+        /*
+         * If we observe contention, there is a concurrent locker.
+         *
+         * Undo and queue; our setting of PENDING might have made the
+         * n,0,0 -> 0,0,0 transition fail and it will now be waiting
+         * on @next to become !NULL.
+         */
+        if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+                /* Undo PENDING if we set it. */
+                if (!(val & _Q_PENDING_MASK))
+                        clear_pending(lock);
+
+                goto queue;
         }

         /*
-         * If pending was clear but there are waiters in the queue, then
-         * we need to undo our setting of pending before we queue ourselves.
+         * We're pending, wait for the owner to go away.
+         *
+         * 0,1,1 -> 0,1,0
+         *
+         * this wait loop must be a load-acquire such that we match the
+         * store-release that clears the locked bit and create lock
+         * sequentiality; this is because not all
+         * clear_pending_set_locked() implementations imply full
+         * barriers.
+         */
+        if (val & _Q_LOCKED_MASK)
+                atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+
+        /*
+         * take ownership and clear the pending bit.
+         *
+         * 0,1,0 -> 0,0,1
          */
-        if (!(val & _Q_PENDING_MASK))
-                clear_pending(lock);
+        clear_pending_set_locked(lock);
+        qstat_inc(qstat_lock_pending, true);
+        return;

         /*
          * End of pending bit optimistic spinning and beginning of MCS
@@ -371,11 +408,16 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 queue:
         qstat_inc(qstat_lock_slowpath, true);
 pv_queue:
-        node = this_cpu_ptr(&mcs_nodes[0]);
+        node = this_cpu_ptr(&qnodes[0].mcs);
         idx = node->count++;
         tail = encode_tail(smp_processor_id(), idx);

-        node += idx;
+        node = grab_mcs_node(node, idx);
+
+        /*
+         * Keep counts of non-zero index values:
+         */
+        qstat_inc(qstat_lock_idx1 + idx - 1, idx);

         /*
          * Ensure that we increment the head node->count before initialising
@@ -476,16 +518,25 @@ locked:
          */

         /*
-         * In the PV case we might already have _Q_LOCKED_VAL set.
+         * In the PV case we might already have _Q_LOCKED_VAL set, because
+         * of lock stealing; therefore we must also allow:
+         *
+         * n,0,1 -> 0,0,1
          *
-         * The atomic_cond_read_acquire() call above has provided the
-         * necessary acquire semantics required for locking.
+         * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+         * above wait condition, therefore any concurrent setting of
+         * PENDING will make the uncontended transition fail.
          */
-        if (((val & _Q_TAIL_MASK) == tail) &&
-            atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
-                goto release; /* No contention */
+        if ((val & _Q_TAIL_MASK) == tail) {
+                if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+                        goto release; /* No contention */
+        }

-        /* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+        /*
+         * Either somebody is queued behind us or _Q_PENDING_VAL got set
+         * which will then detect the remaining tail and queue behind us
+         * ensuring we'll see a @next.
+         */
         set_locked(lock);

         /*
@@ -501,7 +552,7 @@ release:
         /*
          * release the node
          */
-        __this_cpu_dec(mcs_nodes[0].count);
+        __this_cpu_dec(qnodes[0].mcs.count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);

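
The qnode comment above relies on the MCS node being 16 bytes on 64-bit, so four plain qnodes, or two PV-padded ones, share a 64-byte cacheline. For illustration only, the size arithmetic can be checked in a stand-alone program; the struct mcs_spinlock below is a simplified stand-in for the layout in kernel/locking/mcs_spinlock.h, not a copy of this patch:

    #include <assert.h>
    #include <stdio.h>

    /* Simplified stand-in for the kernel's struct mcs_spinlock (16 bytes on LP64). */
    struct mcs_spinlock {
            struct mcs_spinlock *next;
            int locked;
            int count;
    };

    #define CONFIG_PARAVIRT_SPINLOCKS 1

    struct qnode {
            struct mcs_spinlock mcs;
    #ifdef CONFIG_PARAVIRT_SPINLOCKS
            long reserved[2];       /* pad to 32 bytes for the PV fields */
    #endif
    };

    int main(void)
    {
            /* 4 native nodes, or 2 padded PV nodes, per 64-byte cacheline. */
            printf("mcs_spinlock: %zu bytes, qnode: %zu bytes\n",
                   sizeof(struct mcs_spinlock), sizeof(struct qnode));
            static_assert(sizeof(struct qnode) <= 32,
                          "qnode must fit at least twice per 64-byte cacheline");
            return 0;
    }

This mirrors the role of the BUILD_BUG_ON() in the paravirt header below: struct pv_node must never outgrow struct qnode, otherwise the per-CPU qnodes[] array would be overrun.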
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 5a0cf5f9008c..0130e488ebfe 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -49,8 +49,6 @@ enum vcpu_state {

 struct pv_node {
         struct mcs_spinlock     mcs;
-        struct mcs_spinlock     __res[3];
-
         int                     cpu;
         u8                      state;
 };
@@ -281,7 +279,7 @@ static void pv_init_node(struct mcs_spinlock *node)
 {
         struct pv_node *pn = (struct pv_node *)node;

-        BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+        BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));

         pn->cpu = smp_processor_id();
         pn->state = vcpu_running;
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 6bd78c0740fc..42d3d8dc8f49 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -55,6 +55,9 @@ enum qlock_stats {
         qstat_pv_wait_node,
         qstat_lock_pending,
         qstat_lock_slowpath,
+        qstat_lock_idx1,
+        qstat_lock_idx2,
+        qstat_lock_idx3,
         qstat_num,      /* Total number of statistical counters */
         qstat_reset_cnts = qstat_num,
 };
@@ -82,6 +85,9 @@ static const char * const qstat_names[qstat_num + 1] = {
         [qstat_pv_wait_node]       = "pv_wait_node",
         [qstat_lock_pending]       = "lock_pending",
         [qstat_lock_slowpath]      = "lock_slowpath",
+        [qstat_lock_idx1]          = "lock_index1",
+        [qstat_lock_idx2]          = "lock_index2",
+        [qstat_lock_idx3]          = "lock_index3",
         [qstat_reset_cnts]         = "reset_counters",
 };

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2823d4163a37..581edcc63c26 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1485,9 +1485,9 @@ void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
         __rt_mutex_lock(lock, subclass);
 }
 EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
-#endif

-#ifndef CONFIG_DEBUG_LOCK_ALLOC
+#else /* !CONFIG_DEBUG_LOCK_ALLOC */
+
 /**
  * rt_mutex_lock - lock a rt_mutex
  *
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 3064c50e181e..09b180063ee1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -180,7 +180,7 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
                  * but it gives the spinners an early indication that the
                  * readers now have the lock.
                  */
-                rwsem_set_reader_owned(sem);
+                __rwsem_set_reader_owned(sem, waiter->task);
         }

         /*
@@ -233,8 +233,19 @@ __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state)
         waiter.type = RWSEM_WAITING_FOR_READ;

         raw_spin_lock_irq(&sem->wait_lock);
-        if (list_empty(&sem->wait_list))
+        if (list_empty(&sem->wait_list)) {
+                /*
+                 * In case the wait queue is empty and the lock isn't owned
+                 * by a writer, this reader can exit the slowpath and return
+                 * immediately as its RWSEM_ACTIVE_READ_BIAS has already
+                 * been set in the count.
+                 */
+                if (atomic_long_read(&sem->count) >= 0) {
+                        raw_spin_unlock_irq(&sem->wait_lock);
+                        return sem;
+                }
                 adjustment += RWSEM_WAITING_BIAS;
+        }
         list_add_tail(&waiter.list, &sem->wait_list);

         /* we're now waiting on the lock, but no longer actively locking */
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 776308d2fa9e..e586f0d03ad3 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -117,8 +117,9 @@ EXPORT_SYMBOL(down_write_trylock);
 void up_read(struct rw_semaphore *sem)
 {
         rwsem_release(&sem->dep_map, 1, _RET_IP_);
-        DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
+        DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));

+        rwsem_clear_reader_owned(sem);
         __up_read(sem);
 }

@@ -181,7 +182,7 @@ void down_read_non_owner(struct rw_semaphore *sem)
         might_sleep();

         __down_read(sem);
-        rwsem_set_reader_owned(sem);
+        __rwsem_set_reader_owned(sem, NULL);
 }

 EXPORT_SYMBOL(down_read_non_owner);
@@ -215,7 +216,7 @@ EXPORT_SYMBOL(down_write_killable_nested);

 void up_read_non_owner(struct rw_semaphore *sem)
 {
-        DEBUG_RWSEMS_WARN_ON(sem->owner != RWSEM_READER_OWNED);
+        DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED));
         __up_read(sem);
 }

diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index b9d0e72aa80f..bad2bca0268b 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,24 +1,30 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
- * The owner field of the rw_semaphore structure will be set to
- * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
- * the owner field when it unlocks. A reader, on the other hand, will
- * not touch the owner field when it unlocks.
+ * The least significant 2 bits of the owner value has the following
+ * meanings when set.
+ *  - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers
+ *  - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned,
+ *    i.e. the owner(s) cannot be readily determined. It can be reader
+ *    owned or the owning writer is indeterminate.
  *
- * In essence, the owner field now has the following 4 states:
- * 1) 0
- *  - lock is free or the owner hasn't set the field yet
- * 2) RWSEM_READER_OWNED
- *  - lock is currently or previously owned by readers (lock is free
- *    or not set by owner yet)
- * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well
- *  - lock is owned by an anonymous writer, so spinning on the lock
- *    owner should be disabled.
- * 4) Other non-zero value
- *  - a writer owns the lock and other writers can spin on the lock owner.
+ * When a writer acquires a rwsem, it puts its task_struct pointer
+ * into the owner field. It is cleared after an unlock.
+ *
+ * When a reader acquires a rwsem, it will also puts its task_struct
+ * pointer into the owner field with both the RWSEM_READER_OWNED and
+ * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will
+ * largely be left untouched. So for a free or reader-owned rwsem,
+ * the owner value may contain information about the last reader that
+ * acquires the rwsem. The anonymous bit is set because that particular
+ * reader may or may not still own the lock.
+ *
+ * That information may be helpful in debugging cases where the system
+ * seems to hang on a reader owned rwsem especially if only one reader
+ * is involved. Ideally we would like to track all the readers that own
+ * a rwsem, but the overhead is simply too big.
  */
-#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0)
-#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED)
+#define RWSEM_READER_OWNED      (1UL << 0)
+#define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)

 #ifdef CONFIG_DEBUG_RWSEMS
 # define DEBUG_RWSEMS_WARN_ON(c)        DEBUG_LOCKS_WARN_ON(c)
@@ -44,15 +50,26 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem)
         WRITE_ONCE(sem->owner, NULL);
 }

+/*
+ * The task_struct pointer of the last owning reader will be left in
+ * the owner field.
+ *
+ * Note that the owner value just indicates the task has owned the rwsem
+ * previously, it may not be the real owner or one of the real owners
+ * anymore when that field is examined, so take it with a grain of salt.
+ */
+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+                                            struct task_struct *owner)
+{
+        unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED
+                                                 | RWSEM_ANONYMOUSLY_OWNED;
+
+        WRITE_ONCE(sem->owner, (struct task_struct *)val);
+}
+
 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 {
-        /*
-         * We check the owner value first to make sure that we will only
-         * do a write to the rwsem cacheline when it is really necessary
-         * to minimize cacheline contention.
-         */
-        if (READ_ONCE(sem->owner) != RWSEM_READER_OWNED)
-                WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
+        __rwsem_set_reader_owned(sem, current);
 }

 /*
@@ -72,6 +89,25 @@ static inline bool rwsem_has_anonymous_owner(struct task_struct *owner)
 {
         return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED;
 }
+
+#ifdef CONFIG_DEBUG_RWSEMS
+/*
+ * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there
+ * is a task pointer in owner of a reader-owned rwsem, it will be the
+ * real owner or one of the real owners. The only exception is when the
+ * unlock is done by up_read_non_owner().
+ */
+#define rwsem_clear_reader_owned rwsem_clear_reader_owned
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+        unsigned long val = (unsigned long)current | RWSEM_READER_OWNED
+                                                   | RWSEM_ANONYMOUSLY_OWNED;
+        if (READ_ONCE(sem->owner) == (struct task_struct *)val)
+                cmpxchg_relaxed((unsigned long *)&sem->owner, val,
+                                RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED);
+}
+#endif
+
 #else
 static inline void rwsem_set_owner(struct rw_semaphore *sem)
 {
@@ -81,7 +117,18 @@ static inline void rwsem_clear_owner(struct rw_semaphore *sem)
 {
 }

+static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem,
+                                            struct task_struct *owner)
+{
+}
+
 static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
 {
 }
 #endif
+
+#ifndef rwsem_clear_reader_owned
+static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem)
+{
+}
+#endif
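
With the new encoding, a non-NULL owner value is a task_struct pointer with flag bits OR'ed into its two least significant bits. For illustration only, the two parts can be separated by masking; the helpers below are hypothetical and not part of this patch, and the opaque task_struct stands in for the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    struct task_struct;     /* opaque here; only the pointer value matters */

    #define RWSEM_READER_OWNED      (1UL << 0)
    #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1)
    #define RWSEM_OWNER_FLAGS_MASK  (RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED)

    /* Strip the flag bits to recover the stored task pointer (may be stale for readers). */
    static inline struct task_struct *rwsem_owner_task(struct task_struct *owner)
    {
            return (struct task_struct *)((unsigned long)owner & ~RWSEM_OWNER_FLAGS_MASK);
    }

    static inline bool rwsem_owner_is_reader(struct task_struct *owner)
    {
            return (unsigned long)owner & RWSEM_READER_OWNED;
    }

    int main(void)
    {
            struct task_struct *task = (struct task_struct *)0x1000;
            struct task_struct *owner =
                    (struct task_struct *)((unsigned long)task | RWSEM_READER_OWNED
                                                               | RWSEM_ANONYMOUSLY_OWNED);

            printf("task %p reader %d\n", (void *)rwsem_owner_task(owner),
                   rwsem_owner_is_reader(owner));   /* task 0x1000 reader 1 */
            return 0;
    }

Because task_struct allocations are at least word aligned, the low two bits of a real pointer are always zero, which is what makes this tagging scheme safe; for a reader-owned rwsem the recovered pointer is only a debugging hint, as the header comment above stresses.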
