author     Waiman Long <Waiman.Long@hpe.com>          2015-11-09 19:09:27 -0500
committer  Ingo Molnar <mingo@kernel.org>             2015-12-04 05:39:51 -0500
commit     cd0272fab785077c121aa91ec2401090965bbc37 (patch)
tree       a83b168dc2b3fa4086e5409398855ac699272d83 /kernel/locking
parent     1c4941fd53afb46ab15826628e4819866d008a28 (diff)
locking/pvqspinlock: Queue node adaptive spinning
In an overcommitted guest where some vCPUs have to be halted to make
forward progress in other areas, it is highly likely that a vCPU later
in the spinlock queue will be spinning while the ones earlier in the
queue would have been halted. The spinning in the later vCPUs is then
just a waste of precious CPU cycles because they are not going to get
the lock soon as the earlier ones have to be woken up and take their
turn to get the lock.

This patch implements an adaptive spinning mechanism where the vCPU
will call pv_wait() if the previous vCPU is not running.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

                    Westmere                    Haswell
  Patch         32 vCPUs    48 vCPUs    32 vCPUs    48 vCPUs
  -----         --------    --------    --------    --------
  Before patch   3m02.3s     5m00.2s     1m43.7s     3m03.5s
  After patch    3m03.0s     4m37.5s     1m43.0s     2m47.2s

For 32 vCPUs, this patch doesn't cause any noticeable change in
performance. For 48 vCPUs (over-committed), there is about 8%
performance improvement.

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-8-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
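[Editor's note] The core of the mechanism is a periodic check of the previous queue node's vCPU state inside the node's spin loop. Below is a minimal userspace C sketch of that decision only, using the PV_PREV_CHECK_MASK interval and the vcpu_* state names from the patch; the pv_node_sketch type, the atomics, and spin_or_wait_early() are simplified stand-ins, not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>

/* Simplified stand-ins for the kernel's per-node state (hypothetical names). */
enum vcpu_state { vcpu_running, vcpu_halted, vcpu_hashed };

struct pv_node_sketch {
	_Atomic int state;	/* vcpu_* state published by this vCPU     */
	_Atomic int locked;	/* set by our predecessor on MCS handoff   */
};

#define PV_PREV_CHECK_MASK	0xff		/* poll prev every 256 iterations */
#define SPIN_THRESHOLD		(1 << 15)

/*
 * Mirror of the patch's pv_wait_early(): look at the previous node's
 * state only once every PV_PREV_CHECK_MASK + 1 loop iterations so we
 * don't pound on its cacheline, and report "stop spinning" as soon as
 * that vCPU is seen not running.
 */
static bool wait_early(struct pv_node_sketch *prev, int loop)
{
	if (loop & PV_PREV_CHECK_MASK)
		return false;

	return atomic_load(&prev->state) != vcpu_running;
}

/*
 * Skeleton of the queue-node spin loop after the patch: spin for the
 * MCS handoff, but give up early (so the caller can halt the vCPU via
 * pv_wait()) when the previous vCPU has stopped running.
 */
static bool spin_or_wait_early(struct pv_node_sketch *node,
			       struct pv_node_sketch *prev)
{
	for (int loop = SPIN_THRESHOLD; loop; loop--) {
		if (atomic_load(&node->locked))
			return true;		/* lock handed to us             */
		if (wait_early(prev, loop))
			return false;		/* prev not running: wait early  */
	}
	return false;				/* threshold hit: wait as before */
}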
Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/qspinlock.c            5
-rw-r--r--  kernel/locking/qspinlock_paravirt.h  46
-rw-r--r--  kernel/locking/qspinlock_stat.h       3
3 files changed, 50 insertions, 4 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 2ea42999d2d8..393d1874b9e0 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -248,7 +248,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+					   struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
@@ -407,7 +408,7 @@ queue:
 	prev = decode_tail(old);
 	WRITE_ONCE(prev->next, node);
 
-	pv_wait_node(node);
+	pv_wait_node(node, prev);
 	arch_mcs_spin_lock_contended(&node->locked);
 
 	/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index ace60a451b4f..87bb235c3448 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,6 +23,20 @@
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
 /*
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will stop spinning if the vCPU in the previous node is
+ * not running. The one lock stealing attempt allowed at slowpath entry
+ * mitigates the slight slowdown for non-overcommitted guest with this
+ * aggressive wait-early mechanism.
+ *
+ * The status of the previous node will be checked at fixed interval
+ * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't
+ * pound on the cacheline of the previous node too heavily.
+ */
+#define PV_PREV_CHECK_MASK	0xff
+
+/*
  * Queue node uses: vcpu_running & vcpu_halted.
  * Queue head uses: vcpu_running & vcpu_hashed.
  */
@@ -235,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 }
 
 /*
+ * Return true if when it is time to check the previous node which is not
+ * in a running state.
+ */
+static inline bool
+pv_wait_early(struct pv_node *prev, int loop)
+{
+
+	if ((loop & PV_PREV_CHECK_MASK) != 0)
+		return false;
+
+	return READ_ONCE(prev->state) != vcpu_running;
+}
+
+/*
  * Initialize the PV part of the mcs_spinlock node.
  */
 static void pv_init_node(struct mcs_spinlock *node)
@@ -252,17 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)
  * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
  * behalf.
  */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct pv_node *pp = (struct pv_node *)prev;
 	int waitcnt = 0;
 	int loop;
+	bool wait_early;
 
 	/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
 	for (;; waitcnt++) {
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
+		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
 			if (READ_ONCE(node->locked))
 				return;
+			if (pv_wait_early(pp, loop)) {
+				wait_early = true;
+				break;
+			}
 			cpu_relax();
 		}
 
@@ -280,6 +314,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		if (!READ_ONCE(node->locked)) {
 			qstat_inc(qstat_pv_wait_node, true);
 			qstat_inc(qstat_pv_wait_again, waitcnt);
+			qstat_inc(qstat_pv_wait_early, wait_early);
 			pv_wait(&pn->state, vcpu_halted);
 		}
 
@@ -365,6 +400,12 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 
 	for (;; waitcnt++) {
 		/*
+		 * Set correct vCPU state to be used by queue node wait-early
+		 * mechanism.
+		 */
+		WRITE_ONCE(pn->state, vcpu_running);
+
+		/*
 		 * Set the pending bit in the active lock spinning loop to
 		 * disable lock stealing before attempting to acquire the lock.
 		 */
@@ -402,6 +443,7 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
 				goto gotlock;
 			}
 		}
+		WRITE_ONCE(pn->state, vcpu_halted);
 		qstat_inc(qstat_pv_wait_head, true);
 		qstat_inc(qstat_pv_wait_again, waitcnt);
 		pv_wait(&l->locked, _Q_SLOW_VAL);
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
index 94d4533fe984..640dcecdd1df 100644
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -25,6 +25,7 @@
  *   pv_lock_stealing	- # of lock stealing operations
  *   pv_spurious_wakeup	- # of spurious wakeups
  *   pv_wait_again	- # of vCPU wait's that happened after a vCPU kick
+ *   pv_wait_early	- # of early vCPU wait's
  *   pv_wait_head	- # of vCPU wait's at the queue head
  *   pv_wait_node	- # of vCPU wait's at a non-head queue node
  *
@@ -47,6 +48,7 @@ enum qlock_stats {
 	qstat_pv_lock_stealing,
 	qstat_pv_spurious_wakeup,
 	qstat_pv_wait_again,
+	qstat_pv_wait_early,
 	qstat_pv_wait_head,
 	qstat_pv_wait_node,
 	qstat_num,	/* Total number of statistical counters */
@@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = {
 	[qstat_pv_latency_wake]		= "pv_latency_wake",
 	[qstat_pv_lock_stealing]	= "pv_lock_stealing",
 	[qstat_pv_wait_again]		= "pv_wait_again",
+	[qstat_pv_wait_early]		= "pv_wait_early",
 	[qstat_pv_wait_head]		= "pv_wait_head",
 	[qstat_pv_wait_node]		= "pv_wait_node",
 	[qstat_reset_cnts]		= "reset_counters",
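[Editor's note] The new pv_wait_early counter is reported alongside the other statistics above. A small userspace sketch for reading it might look like the following, assuming CONFIG_QUEUED_LOCK_STAT is enabled and the counters are exposed under a qlockstat directory in debugfs mounted at /sys/kernel/debug; both the path and the file layout are assumptions based on the earlier statistics patches, not guaranteed by this patch.

#include <stdio.h>

/* Assumed counter location; adjust if debugfs is mounted elsewhere or the
 * statistics directory is named differently on your kernel. */
#define QSTAT_PV_WAIT_EARLY	"/sys/kernel/debug/qlockstat/pv_wait_early"

int main(void)
{
	unsigned long long count;
	FILE *f = fopen(QSTAT_PV_WAIT_EARLY, "r");

	if (!f) {
		perror(QSTAT_PV_WAIT_EARLY);
		return 1;
	}
	if (fscanf(f, "%llu", &count) == 1)
		printf("pv_wait_early = %llu\n", count);
	fclose(f);
	return 0;
}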