author		Waiman Long <longman@redhat.com>	2018-10-16 09:45:07 -0400
committer	Ingo Molnar <mingo@kernel.org>		2018-10-17 02:37:32 -0400
commit		0fa809ca7f81c47bea6706bc689e941eb25d7e89
tree		6fb16d5e7c68cec519d095c8ab92c2ac6c8df758
parent		1222109a53637f96c581224198b86856d503f892
locking/pvqspinlock: Extend node size when pvqspinlock is configured
The qspinlock code supports up to 4 levels of slowpath nesting using
four per-CPU mcs_spinlock structures. For 64-bit architectures, they
fit nicely in one 64-byte cacheline.

For para-virtualized (PV) qspinlocks it needs to store more information
in the per-CPU node structure than there is space for. It uses a trick
to use a second cacheline to hold the extra information that it needs.
So PV qspinlock needs to access two extra cachelines for its information
whereas the native qspinlock code only needs one extra cacheline.

Freshly added counter profiling of the qspinlock code, however, revealed
that it was very rare to use more than two levels of slowpath nesting.
So it doesn't make sense to penalize PV qspinlock code in order to have
four mcs_spinlock structures in the same cacheline to optimize for a
case in the native qspinlock code that rarely happens.

Extend the per-CPU node structure to have two more long words when PV
qspinlock locks are configured to hold the extra data that it needs.

As a result, the PV qspinlock code will enjoy the same benefit of using
just one extra cacheline like the native counterpart, for most cases.

[ mingo: Minor changelog edits. ]

Signed-off-by: Waiman Long <longman@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Link: http://lkml.kernel.org/r/1539697507-28084-2-git-send-email-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--	kernel/locking/qspinlock.c		34
-rw-r--r--	kernel/locking/qspinlock_paravirt.h	 4
2 files changed, 27 insertions, 11 deletions
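
The cacheline arithmetic in the changelog can be sanity-checked with a small
standalone sketch. This is not kernel code: the struct layouts below merely
mirror mcs_spinlock and the new qnode, and the byte counts assume a typical
LP64 target (8-byte pointers and longs, 4-byte ints).

	/* Standalone size sketch, LP64 assumed; mirrors, but is not, the kernel types. */
	#include <assert.h>

	struct mcs_spinlock {
		struct mcs_spinlock *next;	/*  8 bytes */
		int locked;			/*  4 bytes */
		int count;			/*  4 bytes */
	};					/* 16 bytes total */

	struct qnode {
		struct mcs_spinlock mcs;
		long reserved[2];		/* PV padding: 16 more bytes */
	};					/* 32 bytes total */

	/* Native case: four 16-byte nodes fill one 64-byte cacheline. */
	static_assert(4 * sizeof(struct mcs_spinlock) == 64, "4 nodes per cacheline");

	/* PV case: only two padded 32-byte nodes fit per cacheline. */
	static_assert(2 * sizeof(struct qnode) == 64, "2 PV nodes per cacheline");

Since the profiling cited above shows nesting beyond two levels is rare, two
nodes per cacheline covers almost all PV slowpath entries.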
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce6af1ee2cac..8a8c3c208c5e 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -74,12 +74,24 @@
  */
 
 #include "mcs_spinlock.h"
+#define MAX_NODES	4
 
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. IOW, only
+ * two of them can fit in a cacheline in this case. That is OK as it is rare
+ * to have more than 2 levels of slowpath nesting in actual use. We don't
+ * want to penalize pvqspinlocks to optimize for a rare case in native
+ * qspinlocks.
+ */
+struct qnode {
+	struct mcs_spinlock mcs;
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES	8
-#else
-#define MAX_NODES	4
+	long reserved[2];
 #endif
+};
 
 /*
  * The pending bit spinning loop count.
@@ -101,7 +113,7 @@
  *
  * PV doubles the storage and uses the second cacheline for PV state.
  */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
 
 /*
  * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -126,7 +138,13 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
 	int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
 	int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
 
-	return per_cpu_ptr(&mcs_nodes[idx], cpu);
+	return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+
+static inline __pure
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+	return &((struct qnode *)base + idx)->mcs;
 }
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
@@ -390,11 +408,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 queue:
 	qstat_inc(qstat_lock_slowpath, true);
 pv_queue:
-	node = this_cpu_ptr(&mcs_nodes[0]);
+	node = this_cpu_ptr(&qnodes[0].mcs);
 	idx = node->count++;
 	tail = encode_tail(smp_processor_id(), idx);
 
-	node += idx;
+	node = grab_mcs_node(node, idx);
 
 	/*
 	 * Keep counts of non-zero index values:
@@ -534,7 +552,7 @@ release:
 	/*
 	 * release the node
 	 */
-	__this_cpu_dec(mcs_nodes[0].count);
+	__this_cpu_dec(qnodes[0].mcs.count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
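
The one behavioural subtlety in the qspinlock.c hunks above is the switch from
"node += idx" to grab_mcs_node(node, idx): the per-CPU array now holds struct
qnode elements, so stepping to the idx-th node must advance by the qnode
stride (32 bytes with PV) rather than by sizeof(struct mcs_spinlock). A rough
sketch of that pointer arithmetic, using stand-in types and an ordinary array
instead of the kernel's per-CPU storage:

	/* Stand-in types (see the sizing sketch above); not the kernel definitions. */
	struct mcs_spinlock { struct mcs_spinlock *next; int locked, count; };
	struct qnode { struct mcs_spinlock mcs; long reserved[2]; };

	/* Same shape as the new kernel helper: index by whole qnodes, return the mcs. */
	static inline struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
	{
		return &((struct qnode *)base + idx)->mcs;
	}

	static struct qnode qnodes[4];		/* stand-in for the per-CPU qnodes[] */

	struct mcs_spinlock *pick_node(int idx)
	{
		struct mcs_spinlock *node = &qnodes[0].mcs;

		/*
		 * Lands on &qnodes[idx].mcs, idx * sizeof(struct qnode) bytes
		 * from the base. The old "node += idx" would only advance by
		 * idx * sizeof(struct mcs_spinlock) and, with the padded
		 * qnode, would point into the middle of an earlier slot.
		 */
		return grab_mcs_node(node, idx);
	}
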
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 5a0cf5f9008c..0130e488ebfe 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -49,8 +49,6 @@ enum vcpu_state {
 
 struct pv_node {
 	struct mcs_spinlock	mcs;
-	struct mcs_spinlock	__res[3];
-
 	int			cpu;
 	u8			state;
 };
@@ -281,7 +279,7 @@ static void pv_init_node(struct mcs_spinlock *node)
 {
 	struct pv_node *pn = (struct pv_node *)node;
 
-	BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+	BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));
 
 	pn->cpu = smp_processor_id();
 	pn->state = vcpu_running;
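
The qspinlock_paravirt.h change relies on the same layout: struct pv_node is
only ever reached by casting an mcs_spinlock pointer that actually points into
a qnode slot, so its extra fields (cpu, state) must fit inside the reserved[]
padding. That is what the tightened BUILD_BUG_ON now states. A standalone
sketch of that invariant, using C11 static_assert in place of the kernel's
BUILD_BUG_ON and the same LP64 assumptions as above:

	#include <assert.h>
	#include <stdint.h>

	/* Stand-in types, LP64 assumed; not the kernel definitions. */
	struct mcs_spinlock { struct mcs_spinlock *next; int locked, count; };

	struct qnode {				/* PV configuration: 32 bytes */
		struct mcs_spinlock mcs;
		long reserved[2];
	};

	struct pv_node {			/* overlaid on a qnode slot */
		struct mcs_spinlock mcs;
		int cpu;
		uint8_t state;
	};					/* 24 bytes after padding */

	/* pv_init_node() casts a qnode's mcs pointer to pv_node, so it must fit. */
	static_assert(sizeof(struct pv_node) <= sizeof(struct qnode),
		      "pv_node must fit within a qnode");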