Diffstat (limited to 'kernel/locking')
-rw-r--r--   kernel/locking/qspinlock.c            82
-rw-r--r--   kernel/locking/qspinlock_paravirt.h  252
-rw-r--r--   kernel/locking/qspinlock_stat.h      300
3 files changed, 576 insertions, 58 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 87e9ce6a63c5..393d1874b9e0 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -14,8 +14,9 @@
  * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.
  * (C) Copyright 2013-2014 Red Hat, Inc.
  * (C) Copyright 2015 Intel Corp.
+ * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP
  *
- * Authors: Waiman Long <waiman.long@hp.com>
+ * Authors: Waiman Long <waiman.long@hpe.com>
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
@@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 {
         struct __qspinlock *l = (void *)lock;
 
-        return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+        /*
+         * Use release semantics to make sure that the MCS node is properly
+         * initialized before changing the tail code.
+         */
+        return (u32)xchg_release(&l->tail,
+                                 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
 }
 
 #else /* _Q_PENDING_BITS == 8 */
@@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
 
         for (;;) {
                 new = (val & _Q_LOCKED_PENDING_MASK) | tail;
-                old = atomic_cmpxchg(&lock->val, val, new);
+                /*
+                 * Use release semantics to make sure that the MCS node is
+                 * properly initialized before changing the tail code.
+                 */
+                old = atomic_cmpxchg_release(&lock->val, val, new);
                 if (old == val)
                         break;
 
@@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+                                           struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
                                            struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-                                           struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+                                                  struct mcs_spinlock *node)
+                                                  { return 0; }
 
 #define pv_enabled()            false
 
 #define pv_init_node            __pv_init_node
 #define pv_wait_node            __pv_wait_node
 #define pv_kick_node            __pv_kick_node
-#define pv_wait_head            __pv_wait_head
+#define pv_wait_head_or_lock    __pv_wait_head_or_lock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath       native_queued_spin_lock_slowpath
@@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
                 if (val == new)
                         new |= _Q_PENDING_VAL;
 
-                old = atomic_cmpxchg(&lock->val, val, new);
+                /*
+                 * Acquire semantic is required here as the function may
+                 * return immediately if the lock was free.
+                 */
+                old = atomic_cmpxchg_acquire(&lock->val, val, new);
                 if (old == val)
                         break;
 
@@ -382,6 +398,7 @@ queue:
          * p,*,* -> n,*,*
          */
         old = xchg_tail(lock, tail);
+        next = NULL;
 
         /*
          * if there was a previous node; link it and wait until reaching the
@@ -391,8 +408,18 @@ queue:
                 prev = decode_tail(old);
                 WRITE_ONCE(prev->next, node);
 
-                pv_wait_node(node);
+                pv_wait_node(node, prev);
                 arch_mcs_spin_lock_contended(&node->locked);
+
+                /*
+                 * While waiting for the MCS lock, the next pointer may have
+                 * been set by another lock waiter. We optimistically load
+                 * the next pointer & prefetch the cacheline for writing
+                 * to reduce latency in the upcoming MCS unlock operation.
+                 */
+                next = READ_ONCE(node->next);
+                if (next)
+                        prefetchw(next);
         }
 
         /*
@@ -406,11 +433,22 @@ queue:
          * sequentiality; this is because the set_locked() function below
          * does not imply a full barrier.
          *
+         * The PV pv_wait_head_or_lock function, if active, will acquire
+         * the lock and return a non-zero value. So we have to skip the
+         * smp_load_acquire() call. As the next PV queue head hasn't been
+         * designated yet, there is no way for the locked value to become
+         * _Q_SLOW_VAL. So both the set_locked() and the
+         * atomic_cmpxchg_relaxed() calls will be safe.
+         *
+         * If PV isn't active, 0 will be returned instead.
+         *
          */
-        pv_wait_head(lock, node);
-        while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK)
-                cpu_relax();
+        if ((val = pv_wait_head_or_lock(lock, node)))
+                goto locked;
 
+        smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+
+locked:
         /*
          * claim the lock:
          *
@@ -422,11 +460,17 @@ queue:
          * to grab the lock.
          */
         for (;;) {
-                if (val != tail) {
+                /* In the PV case we might already have _Q_LOCKED_VAL set */
+                if ((val & _Q_TAIL_MASK) != tail) {
                         set_locked(lock);
                         break;
                 }
-                old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL);
+                /*
+                 * The smp_load_acquire() call above has provided the necessary
+                 * acquire semantics required for locking. At most two
+                 * iterations of this loop may be ran.
+                 */
+                old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
                 if (old == val)
                         goto release;   /* No contention */
 
@@ -434,10 +478,12 @@ queue:
         }
 
         /*
-         * contended path; wait for next, release.
+         * contended path; wait for next if not observed yet, release.
          */
-        while (!(next = READ_ONCE(node->next)))
-                cpu_relax();
+        if (!next) {
+                while (!(next = READ_ONCE(node->next)))
+                        cpu_relax();
+        }
 
         arch_mcs_spin_unlock_contended(&next->locked);
         pv_kick_node(lock, next);
@@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_init_node
 #undef pv_wait_node
 #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
 
 #undef queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath       __pv_queued_spin_lock_slowpath
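
The qspinlock.c hunks above repeatedly pair a release on the publishing side (xchg_release()/atomic_cmpxchg_release() when the tail is updated) with an acquire on the observing side (atomic_cmpxchg_acquire() and smp_cond_acquire() in the slowpath). As a minimal, self-contained illustration of that ordering idiom, written with standard C11 atomics rather than the kernel's primitives and using made-up names (mcs_node_demo, tail_demo, publish_node, observe_tail), the sketch below publishes a node only after its fields are initialised, so an acquire load is guaranteed to observe them:

#include <stdatomic.h>
#include <stddef.h>

struct mcs_node_demo {
        struct mcs_node_demo *_Atomic next;
        atomic_int locked;
};

static struct mcs_node_demo *_Atomic tail_demo;

/* Writer: initialise the node first, then publish it with a release xchg. */
static struct mcs_node_demo *publish_node(struct mcs_node_demo *node)
{
        atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
        atomic_store_explicit(&node->locked, 0, memory_order_relaxed);

        /* Pairs with the acquire load in observe_tail(). */
        return atomic_exchange_explicit(&tail_demo, node, memory_order_release);
}

/* Reader: an acquire load of the tail makes the node's fields visible. */
static struct mcs_node_demo *observe_tail(void)
{
        return atomic_load_explicit(&tail_demo, memory_order_acquire);
}

The same publish/observe reasoning is why the patch can use atomic_cmpxchg_relaxed() when the queue head finally claims the lock: per its own comment, the acquire has already been provided by the earlier smp_cond_acquire()/smp_load_acquire().
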
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index f0450ff4829b..87bb235c3448 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
 #define _Q_SLOW_VAL     (3U << _Q_LOCKED_OFFSET)
 
 /*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK      0xff
+
+/*
  * Queue node uses: vcpu_running & vcpu_halted.
  * Queue head uses: vcpu_running & vcpu_hashed.
  */
@@ -41,6 +55,94 @@ struct pv_node {
 };
 
 /*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l)  pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+
+        return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+                (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+
+        WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+
+        WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+
+        return !READ_ONCE(l->locked) &&
+               (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+                        == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+        atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+        atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+        int val = atomic_read(&lock->val);
+
+        for (;;) {
+                int old, new;
+
+                if (val & _Q_LOCKED_MASK)
+                        break;
+
+                /*
+                 * Try to clear pending bit & set locked bit
+                 */
+                old = val;
+                new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+                val = atomic_cmpxchg(&lock->val, old, new);
+
+                if (val == old)
+                        return 1;
+        }
+        return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
+/*
  * Lock and MCS node addresses hash table for fast lookup
  *
  * Hashing is done on a per-cacheline basis to minimize the need to access
@@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
         unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
         struct pv_hash_entry *he;
+        int hopcnt = 0;
 
         for_each_hash_entry(he, offset, hash) {
+                hopcnt++;
                 if (!cmpxchg(&he->lock, NULL, lock)) {
                         WRITE_ONCE(he->node, node);
+                        qstat_hop(hopcnt);
                         return &he->lock;
                 }
         }
@@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 }
 
 /*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+        if ((loop & PV_PREV_CHECK_MASK) != 0)
+                return false;
+
+        return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
  * Initialize the PV part of the mcs_spinlock node.
  */
 static void pv_init_node(struct mcs_spinlock *node)
@@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
  * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
  * behalf.
  */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
         struct pv_node *pn = (struct pv_node *)node;
+        struct pv_node *pp = (struct pv_node *)prev;
+        int waitcnt = 0;
         int loop;
+        bool wait_early;
 
-        for (;;) {
-                for (loop = SPIN_THRESHOLD; loop; loop--) {
+        /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+        for (;; waitcnt++) {
+                for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
                         if (READ_ONCE(node->locked))
                                 return;
+                        if (pv_wait_early(pp, loop)) {
+                                wait_early = true;
+                                break;
+                        }
                         cpu_relax();
                 }
 
@@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)
                  */
                 smp_store_mb(pn->state, vcpu_halted);
 
-                if (!READ_ONCE(node->locked))
+                if (!READ_ONCE(node->locked)) {
+                        qstat_inc(qstat_pv_wait_node, true);
+                        qstat_inc(qstat_pv_wait_again, waitcnt);
+                        qstat_inc(qstat_pv_wait_early, wait_early);
                         pv_wait(&pn->state, vcpu_halted);
+                }
 
                 /*
-                 * If pv_kick_node() changed us to vcpu_hashed, retain that value
-                 * so that pv_wait_head() knows to not also try to hash this lock.
+                 * If pv_kick_node() changed us to vcpu_hashed, retain that
+                 * value so that pv_wait_head_or_lock() knows to not also try
+                 * to hash this lock.
                  */
                 cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
                  * So it is better to spin for a while in the hope that the
                  * MCS lock will be released soon.
                  */
+                qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
         }
 
         /*
@@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 /*
  * Called after setting next->locked = 1 when we're the lock owner.
  *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
  */
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
@@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 }
 
 /*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
  * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
  */
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
         struct pv_node *pn = (struct pv_node *)node;
         struct __qspinlock *l = (void *)lock;
         struct qspinlock **lp = NULL;
+        int waitcnt = 0;
         int loop;
 
         /*
@@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
         if (READ_ONCE(pn->state) == vcpu_hashed)
                 lp = (struct qspinlock **)1;
 
-        for (;;) {
+        for (;; waitcnt++) {
+                /*
+                 * Set correct vCPU state to be used by queue node wait-early
+                 * mechanism.
+                 */
+                WRITE_ONCE(pn->state, vcpu_running);
+
+                /*
+                 * Set the pending bit in the active lock spinning loop to
+                 * disable lock stealing before attempting to acquire the lock.
+                 */
+                set_pending(lock);
                 for (loop = SPIN_THRESHOLD; loop; loop--) {
-                        if (!READ_ONCE(l->locked))
-                                return;
+                        if (trylock_clear_pending(lock))
+                                goto gotlock;
                         cpu_relax();
                 }
+                clear_pending(lock);
+
 
                 if (!lp) { /* ONCE */
                         lp = pv_hash(lock, pn);
@@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
                          *
                          * Matches the smp_rmb() in __pv_queued_spin_unlock().
                          */
-                        if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+                        if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
                                 /*
-                                 * The lock is free and _Q_SLOW_VAL has never
-                                 * been set. Therefore we need to unhash before
-                                 * getting the lock.
+                                 * The lock was free and now we own the lock.
+                                 * Change the lock value back to _Q_LOCKED_VAL
+                                 * and unhash the table.
                                  */
+                                WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
                                 WRITE_ONCE(*lp, NULL);
-                                return;
+                                goto gotlock;
                         }
                 }
+                WRITE_ONCE(pn->state, vcpu_halted);
+                qstat_inc(qstat_pv_wait_head, true);
+                qstat_inc(qstat_pv_wait_again, waitcnt);
                 pv_wait(&l->locked, _Q_SLOW_VAL);
 
                 /*
                  * The unlocker should have freed the lock before kicking the
                  * CPU. So if the lock is still not free, it is a spurious
-                 * wakeup and so the vCPU should wait again after spinning for
-                 * a while.
+                 * wakeup or another vCPU has stolen the lock. The current
+                 * vCPU should spin again.
                  */
+                qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
         }
 
         /*
-         * Lock is unlocked now; the caller will acquire it without waiting.
-         * As with pv_wait_node() we rely on the caller to do a load-acquire
-         * for us.
+         * The cmpxchg() or xchg() call before coming here provides the
+         * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+         * here is to indicate to the compiler that the value will always
+         * be nozero to enable better code optimization.
          */
+gotlock:
+        return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
 }
 
 /*
- * PV version of the unlock function to be used in stead of
- * queued_spin_unlock().
+ * PV versions of the unlock fastpath and slowpath functions to be used
+ * instead of queued_spin_unlock().
  */
-__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+__visible void
+__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 {
         struct __qspinlock *l = (void *)lock;
         struct pv_node *node;
-        u8 locked;
-
-        /*
-         * We must not unlock if SLOW, because in that case we must first
-         * unhash. Otherwise it would be possible to have multiple @lock
-         * entries, which would be BAD.
-         */
-        locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
-        if (likely(locked == _Q_LOCKED_VAL))
-                return;
 
         if (unlikely(locked != _Q_SLOW_VAL)) {
                 WARN(!debug_locks_silent,
@@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
          * so we need a barrier to order the read of the node data in
          * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
          *
-         * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+         * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
          */
         smp_rmb();
 
@@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
          * vCPU is harmless other than the additional latency in completing
          * the unlock.
          */
+        qstat_inc(qstat_pv_kick_unlock, true);
         pv_kick(node->cpu);
 }
+
 /*
  * Include the architecture specific callee-save thunk of the
  * __pv_queued_spin_unlock(). This thunk is put together with
- * __pv_queued_spin_unlock() near the top of the file to make sure
- * that the callee-save thunk and the real unlock function are close
- * to each other sharing consecutive instruction cachelines.
+ * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock
+ * function close to each other sharing consecutive instruction cachelines.
+ * Alternatively, architecture specific version of __pv_queued_spin_unlock()
+ * can be defined.
  */
 #include <asm/qspinlock_paravirt.h>
 
+#ifndef __pv_queued_spin_unlock
+__visible void __pv_queued_spin_unlock(struct qspinlock *lock)
+{
+        struct __qspinlock *l = (void *)lock;
+        u8 locked;
+
+        /*
+         * We must not unlock if SLOW, because in that case we must first
+         * unhash. Otherwise it would be possible to have multiple @lock
+         * entries, which would be BAD.
+         */
+        locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+        if (likely(locked == _Q_LOCKED_VAL))
+                return;
+
+        __pv_queued_spin_unlock_slowpath(lock, locked);
+}
+#endif /* __pv_queued_spin_unlock */
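
A reader-level model of the lock-stealing window introduced above: a waiter entering the PV slowpath may take the lock once if neither the locked nor the pending byte is set, while the queue head raises the pending bit during its active spin to close that window, then acquires the lock by turning pending into locked (trylock_clear_pending()). The sketch below expresses that protocol with plain C11 atomics; the bit layout and all demo_* names are simplifying assumptions, not the real qspinlock word layout:

#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_LOCKED     0x01u
#define DEMO_PENDING    0x02u

static atomic_uint demo_lock;

/* Slowpath entry: one stealing attempt, only while locked and pending are clear. */
static bool demo_steal(void)
{
        unsigned int old = 0;

        if (atomic_load_explicit(&demo_lock, memory_order_relaxed) &
            (DEMO_LOCKED | DEMO_PENDING))
                return false;
        return atomic_compare_exchange_strong_explicit(&demo_lock, &old,
                        DEMO_LOCKED, memory_order_acquire,
                        memory_order_relaxed);
}

/* Queue head: raise the pending bit to forbid stealing while it spins. */
static void demo_block_stealing(void)
{
        atomic_fetch_or_explicit(&demo_lock, DEMO_PENDING,
                                 memory_order_relaxed);
}

/* Queue head acquire: turn pending into locked in one atomic step. */
static bool demo_trylock_clear_pending(void)
{
        unsigned int old = DEMO_PENDING;

        return atomic_compare_exchange_strong_explicit(&demo_lock, &old,
                        DEMO_LOCKED, memory_order_acquire,
                        memory_order_relaxed);
}

The acquire ordering on the successful compare-exchange plays the same role as the fully ordered cmpxchg() in pv_queued_spin_steal_lock() and trylock_clear_pending(): it is what turns the steal or hand-off into a proper lock acquisition.
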
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644
index 000000000000..640dcecdd1df
--- /dev/null
+++ b/kernel/locking/qspinlock_stat.h
@@ -0,0 +1,300 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ *   pv_hash_hops       - average # of hops per hashing operation
+ *   pv_kick_unlock     - # of vCPU kicks issued at unlock time
+ *   pv_kick_wake       - # of vCPU kicks used for computing pv_latency_wake
+ *   pv_latency_kick    - average latency (ns) of vCPU kick operation
+ *   pv_latency_wake    - average latency (ns) from vCPU kick to wakeup
+ *   pv_lock_stealing   - # of lock stealing operations
+ *   pv_spurious_wakeup - # of spurious wakeups
+ *   pv_wait_again      - # of vCPU wait's that happened after a vCPU kick
+ *   pv_wait_early      - # of early vCPU wait's
+ *   pv_wait_head       - # of vCPU wait's at the queue head
+ *   pv_wait_node       - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be slight difference between pv_kick_wake and pv_kick_unlock.
+ */
+enum qlock_stats {
+        qstat_pv_hash_hops,
+        qstat_pv_kick_unlock,
+        qstat_pv_kick_wake,
+        qstat_pv_latency_kick,
+        qstat_pv_latency_wake,
+        qstat_pv_lock_stealing,
+        qstat_pv_spurious_wakeup,
+        qstat_pv_wait_again,
+        qstat_pv_wait_early,
+        qstat_pv_wait_head,
+        qstat_pv_wait_node,
+        qstat_num,      /* Total number of statistical counters */
+        qstat_reset_cnts = qstat_num,
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+static const char * const qstat_names[qstat_num + 1] = {
+        [qstat_pv_hash_hops]       = "pv_hash_hops",
+        [qstat_pv_kick_unlock]     = "pv_kick_unlock",
+        [qstat_pv_kick_wake]       = "pv_kick_wake",
+        [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+        [qstat_pv_latency_kick]    = "pv_latency_kick",
+        [qstat_pv_latency_wake]    = "pv_latency_wake",
+        [qstat_pv_lock_stealing]   = "pv_lock_stealing",
+        [qstat_pv_wait_again]      = "pv_wait_again",
+        [qstat_pv_wait_early]      = "pv_wait_early",
+        [qstat_pv_wait_head]       = "pv_wait_head",
+        [qstat_pv_wait_node]       = "pv_wait_node",
+        [qstat_reset_cnts]         = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ *    Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+                          size_t count, loff_t *ppos)
+{
+        char buf[64];
+        int cpu, counter, len;
+        u64 stat = 0, kicks = 0;
+
+        /*
+         * Get the counter ID stored in file->f_inode->i_private
+         */
+        if (!file->f_inode) {
+                WARN_ON_ONCE(1);
+                return -EBADF;
+        }
+        counter = (long)(file->f_inode->i_private);
+
+        if (counter >= qstat_num)
+                return -EBADF;
+
+        for_each_possible_cpu(cpu) {
+                stat += per_cpu(qstats[counter], cpu);
+                /*
+                 * Need to sum additional counter for some of them
+                 */
+                switch (counter) {
+
+                case qstat_pv_latency_kick:
+                case qstat_pv_hash_hops:
+                        kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+                        break;
+
+                case qstat_pv_latency_wake:
+                        kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+                        break;
+                }
+        }
+
+        if (counter == qstat_pv_hash_hops) {
+                u64 frac;
+
+                frac = 100ULL * do_div(stat, kicks);
+                frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+
+                /*
+                 * Return a X.XX decimal number
+                 */
+                len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+        } else {
+                /*
+                 * Round to the nearest ns
+                 */
+                if ((counter == qstat_pv_latency_kick) ||
+                    (counter == qstat_pv_latency_wake)) {
+                        stat = 0;
+                        if (kicks)
+                                stat = DIV_ROUND_CLOSEST_ULL(stat, kicks);
+                }
+                len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+        }
+
+        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * When counter = reset_cnts, reset all the counter values.
+ * Since the counter updates aren't atomic, the resetting is done twice
+ * to make sure that the counters are very likely to be all cleared.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+                           size_t count, loff_t *ppos)
+{
+        int cpu;
+
+        /*
+         * Get the counter ID stored in file->f_inode->i_private
+         */
+        if (!file->f_inode) {
+                WARN_ON_ONCE(1);
+                return -EBADF;
+        }
+        if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+                return count;
+
+        for_each_possible_cpu(cpu) {
+                int i;
+                unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+
+                for (i = 0 ; i < qstat_num; i++)
+                        WRITE_ONCE(ptr[i], 0);
+                for (i = 0 ; i < qstat_num; i++)
+                        WRITE_ONCE(ptr[i], 0);
+        }
+        return count;
+}
+
+/*
+ * Debugfs data structures
+ */
+static const struct file_operations fops_qstat = {
+        .read = qstat_read,
+        .write = qstat_write,
+        .llseek = default_llseek,
+};
+
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ */
+static int __init init_qspinlock_stat(void)
+{
+        struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
+        int i;
+
+        if (!d_qstat) {
+                pr_warn("Could not create 'qlockstat' debugfs directory\n");
+                return 0;
+        }
+
+        /*
+         * Create the debugfs files
+         *
+         * As reading from and writing to the stat files can be slow, only
+         * root is allowed to do the read/write to limit impact to system
+         * performance.
+         */
+        for (i = 0; i < qstat_num; i++)
+                debugfs_create_file(qstat_names[i], 0400, d_qstat,
+                                    (void *)(long)i, &fops_qstat);
+
+        debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat,
+                            (void *)(long)qstat_reset_cnts, &fops_qstat);
+        return 0;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistical counters
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+        if (cond)
+                this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ */
+static inline void qstat_hop(int hopcnt)
+{
+        this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ */
+static inline void __pv_kick(int cpu)
+{
+        u64 start = sched_clock();
+
+        per_cpu(pv_kick_time, cpu) = start;
+        pv_kick(cpu);
+        this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+        u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+        *pkick_time = 0;
+        pv_wait(ptr, val);
+        if (*pkick_time) {
+                this_cpu_add(qstats[qstat_pv_latency_wake],
+                             sched_clock() - *pkick_time);
+                qstat_inc(qstat_pv_kick_wake, true);
+        }
+}
+
+#define pv_kick(c)      __pv_kick(c)
+#define pv_wait(p, v)   __pv_wait(p, v)
+
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+        int ret = pv_queued_spin_steal_lock(lock);
+
+        qstat_inc(qstat_pv_lock_stealing, ret);
+        return ret;
+}
+#undef queued_spin_trylock
+#define queued_spin_trylock(l)  qstat_spin_steal_lock(l)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt)                       { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */
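
For reference, the averaged counters exposed above are plain ratios computed at read time: pv_hash_hops and pv_latency_kick are divided by pv_kick_unlock, and pv_latency_wake by pv_kick_wake, with pv_hash_hops printed as an X.XX decimal. A small userspace sketch of that formatting step, assuming nothing beyond the arithmetic visible in qstat_read() (the function name format_average is made up for illustration), could look like this:

#include <stdio.h>
#include <stdint.h>

/* Format total/kicks as "X.XX", rounding the fraction to two decimals. */
static int format_average(char *buf, size_t len, uint64_t total, uint64_t kicks)
{
        uint64_t whole, frac;

        if (!kicks)
                return snprintf(buf, len, "0.00\n");

        whole = total / kicks;
        frac  = ((total % kicks) * 100 + kicks / 2) / kicks;
        if (frac == 100) {      /* rounding carried into the integer part */
                whole++;
                frac = 0;
        }
        return snprintf(buf, len, "%llu.%02llu\n",
                        (unsigned long long)whole, (unsigned long long)frac);
}

Called as format_average(buf, sizeof(buf), 314, 100), this yields "3.14", mirroring the do_div()/DIV_ROUND_CLOSEST_ULL() sequence in qstat_read() while guarding the kicks == 0 and rounding-carry cases explicitly.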