author    Waiman Long <Waiman.Long@hpe.com>    2015-11-10 16:18:56 -0500
committer Ingo Molnar <mingo@kernel.org>       2015-12-04 05:39:51 -0500
commit    1c4941fd53afb46ab15826628e4819866d008a28 (patch)
tree      eaf0f35a79725bf10476a90634245824a7ccb8c3
parent    45e898b735620f426eddf105fc886d2966593a58 (diff)
locking/pvqspinlock: Allow limited lock stealing
This patch allows one attempt for the lock waiter to steal the lock
when entering the PV slowpath. To prevent lock starvation, the pending
bit will be set by the queue head vCPU when it is in the active lock
spinning loop to disable any lock stealing attempt. This helps to
reduce the performance penalty caused by lock waiter preemption while
not having much of the downsides of a real unfair lock.

The pv_wait_head() function was renamed to pv_wait_head_or_lock() as
it was modified to acquire the lock before returning. This is necessary
because of possible lock stealing attempts from other tasks.

Linux kernel builds were run in a KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems were configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

                    Westmere                    Haswell
  Patch         32 vCPUs    48 vCPUs    32 vCPUs    48 vCPUs
  -----         --------    --------    --------    --------
  Before patch   3m15.6s    10m56.1s     1m44.1s     5m29.1s
  After patch    3m02.3s     5m00.2s     1m43.7s     3m03.5s

For the overcommitted case (48 vCPUs), this patch is able to reduce
kernel build time by more than 54% for Westmere and 44% for Haswell.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447190336-53317-1-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
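The core of the change is a single one-shot stealing attempt gated on the lock word, plus a pending bit the queue head uses to switch stealing off. As a rough, self-contained C11 model of that one-shot attempt (not the kernel code: struct model_qspinlock, the MODEL_* constants and model_steal_lock() are invented here for illustration; the patch itself operates on struct __qspinlock with cmpxchg() and the _Q_* masks, as shown in the diff below):

#include <stdatomic.h>
#include <stdbool.h>

/* Simplified lock word for illustration: bit 0 = locked, bit 1 = pending. */
#define MODEL_LOCKED_VAL	1u
#define MODEL_PENDING_VAL	2u

struct model_qspinlock {
	_Atomic unsigned int val;
};

/*
 * One lock-stealing attempt, made once before queuing: only try the
 * compare-and-swap when neither the locked nor the pending bit is set,
 * mirroring what pv_queued_spin_steal_lock() does in the patch.
 */
static bool model_steal_lock(struct model_qspinlock *lock)
{
	unsigned int expected = 0;

	if (atomic_load_explicit(&lock->val, memory_order_relaxed) &
	    (MODEL_LOCKED_VAL | MODEL_PENDING_VAL))
		return false;

	return atomic_compare_exchange_strong_explicit(&lock->val, &expected,
			MODEL_LOCKED_VAL, memory_order_acquire,
			memory_order_relaxed);
}

A waiter that loses this single attempt simply queues as before.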
-rw-r--r--	kernel/locking/qspinlock.c		 26
-rw-r--r--	kernel/locking/qspinlock_paravirt.h	141
-rw-r--r--	kernel/locking/qspinlock_stat.h		 16
3 files changed, 155 insertions(+), 28 deletions(-)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ed9d96708f93..2ea42999d2d8 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -251,15 +251,16 @@ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-					   struct mcs_spinlock *node) { }
+static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
+						   struct mcs_spinlock *node)
+						   { return 0; }
 
 #define pv_enabled()		false
 
 #define pv_init_node		__pv_init_node
 #define pv_wait_node		__pv_wait_node
 #define pv_kick_node		__pv_kick_node
-#define pv_wait_head		__pv_wait_head
+#define pv_wait_head_or_lock	__pv_wait_head_or_lock
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
@@ -431,10 +432,22 @@ queue:
 	 * sequentiality; this is because the set_locked() function below
 	 * does not imply a full barrier.
 	 *
+	 * The PV pv_wait_head_or_lock function, if active, will acquire
+	 * the lock and return a non-zero value. So we have to skip the
+	 * smp_load_acquire() call. As the next PV queue head hasn't been
+	 * designated yet, there is no way for the locked value to become
+	 * _Q_SLOW_VAL. So both the set_locked() and the
+	 * atomic_cmpxchg_relaxed() calls will be safe.
+	 *
+	 * If PV isn't active, 0 will be returned instead.
+	 *
 	 */
-	pv_wait_head(lock, node);
+	if ((val = pv_wait_head_or_lock(lock, node)))
+		goto locked;
+
 	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
 
+locked:
 	/*
 	 * claim the lock:
 	 *
@@ -446,7 +459,8 @@ queue:
 	 * to grab the lock.
 	 */
 	for (;;) {
-		if (val != tail) {
+		/* In the PV case we might already have _Q_LOCKED_VAL set */
+		if ((val & _Q_TAIL_MASK) != tail) {
 			set_locked(lock);
 			break;
 		}
@@ -493,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_init_node
 #undef pv_wait_node
 #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
 
 #undef queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index aaeeefb791f8..ace60a451b4f 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -41,6 +41,89 @@ struct pv_node {
 };
 
 /*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enter the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+		(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+	struct __qspinlock *l = (void *)lock;
+
+	return !READ_ONCE(l->locked) &&
+	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+			== _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+	atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+	atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+	int val = atomic_read(&lock->val);
+
+	for (;;) {
+		int old, new;
+
+		if (val & _Q_LOCKED_MASK)
+			break;
+
+		/*
+		 * Try to clear pending bit & set locked bit
+		 */
+		old = val;
+		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+		val = atomic_cmpxchg(&lock->val, old, new);
+
+		if (val == old)
+			return 1;
+	}
+	return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
+/*
  * Include queued spinlock statistics code
  */
 #include "qspinlock_stat.h"
@@ -202,8 +285,8 @@ static void pv_wait_node(struct mcs_spinlock *node)
 
 	/*
 	 * If pv_kick_node() changed us to vcpu_hashed, retain that
-	 * value so that pv_wait_head() knows to not also try to hash
-	 * this lock.
+	 * value so that pv_wait_head_or_lock() knows to not also try
+	 * to hash this lock.
 	 */
 	cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -227,8 +310,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
 /*
  * Called after setting next->locked = 1 when we're the lock owner.
  *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
  */
 static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 {
@@ -257,10 +341,14 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 }
 
 /*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
  * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
  */
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
 	struct __qspinlock *l = (void *)lock;
@@ -276,11 +364,18 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 		lp = (struct qspinlock **)1;
 
 	for (;; waitcnt++) {
+		/*
+		 * Set the pending bit in the active lock spinning loop to
+		 * disable lock stealing before attempting to acquire the lock.
+		 */
+		set_pending(lock);
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
-			if (!READ_ONCE(l->locked))
-				return;
+			if (trylock_clear_pending(lock))
+				goto gotlock;
 			cpu_relax();
 		}
+		clear_pending(lock);
+
 
 		if (!lp) { /* ONCE */
 			lp = pv_hash(lock, pn);
@@ -296,36 +391,38 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 			 *
 			 * Matches the smp_rmb() in __pv_queued_spin_unlock().
 			 */
-			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+			if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
 				/*
-				 * The lock is free and _Q_SLOW_VAL has never
-				 * been set. Therefore we need to unhash before
-				 * getting the lock.
+				 * The lock was free and now we own the lock.
+				 * Change the lock value back to _Q_LOCKED_VAL
+				 * and unhash the table.
 				 */
+				WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
 				WRITE_ONCE(*lp, NULL);
-				return;
+				goto gotlock;
 			}
 		}
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
 
-		if (!READ_ONCE(l->locked))
-			return;
 		/*
 		 * The unlocker should have freed the lock before kicking the
 		 * CPU. So if the lock is still not free, it is a spurious
-		 * wakeup and so the vCPU should wait again after spinning for
-		 * a while.
+		 * wakeup or another vCPU has stolen the lock. The current
+		 * vCPU should spin again.
 		 */
-		qstat_inc(qstat_pv_spurious_wakeup, true);
+		qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
 	}
 
 	/*
-	 * Lock is unlocked now; the caller will acquire it without waiting.
-	 * As with pv_wait_node() we rely on the caller to do a load-acquire
-	 * for us.
+	 * The cmpxchg() or xchg() call before coming here provides the
+	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+	 * here is to indicate to the compiler that the value will always
+	 * be nozero to enable better code optimization.
 	 */
+gotlock:
+	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
 }
 
 /*
@@ -350,7 +447,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
 	 * so we need a barrier to order the read of the node data in
 	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
 	 *
-	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
 	 */
 	smp_rmb();
 
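For comparison with the queue-head changes above, the same simplified userspace model from the earlier sketch can picture the head's loop: set the pending bit so nobody can steal, spin a bounded number of times trying to convert pending into locked, then clear pending again before halting. This is a hedged illustration only; MODEL_SPIN_THRESHOLD, model_halt() and the other model_* names are stand-ins, while the real code uses set_pending(), trylock_clear_pending(), clear_pending() and pv_wait() exactly as shown in the diff:

#include <stdatomic.h>

/* Same simplified lock word as in the earlier sketch (illustration only). */
#define MODEL_LOCKED_VAL	1u
#define MODEL_PENDING_VAL	2u
#define MODEL_SPIN_THRESHOLD	(1 << 10)	/* stand-in for SPIN_THRESHOLD */

struct model_qspinlock {
	_Atomic unsigned int val;
};

/* Placeholder for pv_wait(): a real implementation would block the vCPU. */
static void model_halt(void) { }

static void model_wait_head_or_lock(struct model_qspinlock *lock)
{
	for (;;) {
		unsigned int expected;
		int loop;

		/* Disable lock stealing while the queue head actively spins. */
		atomic_fetch_or_explicit(&lock->val, MODEL_PENDING_VAL,
					 memory_order_relaxed);

		for (loop = MODEL_SPIN_THRESHOLD; loop; loop--) {
			/* Try to turn "pending, unlocked" into "locked". */
			expected = MODEL_PENDING_VAL;
			if (atomic_compare_exchange_strong_explicit(&lock->val,
					&expected, MODEL_LOCKED_VAL,
					memory_order_acquire,
					memory_order_relaxed))
				return;	/* lock acquired */
		}

		/* Spin budget exhausted: allow stealing again, then halt. */
		atomic_fetch_and_explicit(&lock->val, ~MODEL_PENDING_VAL,
					  memory_order_relaxed);
		model_halt();
	}
}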
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index b1553adec2e7..94d4533fe984 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,6 +22,7 @@
  * pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake
  * pv_latency_kick	- average latency (ns) of vCPU kick operation
  * pv_latency_wake	- average latency (ns) from vCPU kick to wakeup
+ * pv_lock_stealing	- # of lock stealing operations
  * pv_spurious_wakeup	- # of spurious wakeups
  * pv_wait_again	- # of vCPU wait's that happened after a vCPU kick
  * pv_wait_head	- # of vCPU wait's at the queue head
@@ -43,6 +44,7 @@ enum qlock_stats {
 	qstat_pv_kick_wake,
 	qstat_pv_latency_kick,
 	qstat_pv_latency_wake,
+	qstat_pv_lock_stealing,
 	qstat_pv_spurious_wakeup,
 	qstat_pv_wait_again,
 	qstat_pv_wait_head,
@@ -66,6 +68,7 @@ static const char * const qstat_names[qstat_num + 1] = {
 	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
 	[qstat_pv_latency_kick]	   = "pv_latency_kick",
 	[qstat_pv_latency_wake]	   = "pv_latency_wake",
+	[qstat_pv_lock_stealing]   = "pv_lock_stealing",
 	[qstat_pv_wait_again]	   = "pv_wait_again",
 	[qstat_pv_wait_head]	   = "pv_wait_head",
 	[qstat_pv_wait_node]	   = "pv_wait_node",
@@ -273,6 +276,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
 #define pv_kick(c)	__pv_kick(c)
 #define pv_wait(p, v)	__pv_wait(p, v)
 
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+	int ret = pv_queued_spin_steal_lock(lock);
+
+	qstat_inc(qstat_pv_lock_stealing, ret);
+	return ret;
+}
+#undef  queued_spin_trylock
+#define queued_spin_trylock(l)	qstat_spin_steal_lock(l)
+
 #else /* CONFIG_QUEUED_LOCK_STAT */
 
 static inline void qstat_inc(enum qlock_stats stat, bool cond) { }