2 files changed, 27 insertions, 11 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce6af1ee2cac..8a8c3c208c5e 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -74,12 +74,24 @@
 */
 #include "mcs_spinlock.h"
+#define MAX_NODES       4       /* entries in the per-CPU qnodes[] array */
+/*
+ * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
+ * size and four of them will fit nicely in one 64-byte cacheline. For
+ * pvqspinlock, however, we need more space for extra data. To accommodate
+ * that, we insert two more long words to pad it up to 32 bytes. In other
+ * words, only two of them can fit in a cacheline in this case. That is OK
+ * as it is rare to have more than 2 levels of slowpath nesting in actual
+ * use. We don't want to penalize pvqspinlocks to optimize for a rare case
+ * in native qspinlocks.
+ */
+struct qnode {
+        struct mcs_spinlock mcs;        /* must stay first: grab_mcs_node() casts &mcs back to the qnode */
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
-#define MAX_NODES       8
+        long reserved[2];               /* room for PV-only state beyond mcs (see struct pv_node) */
-#else
-#define MAX_NODES       4
 #endif
+};
 /*
 * The pending bit spinning loop count.
@@ -101,7 +113,7 @@
 *
 * PV doubles the storage and uses the second cacheline for PV state.
 */
-static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]);
+static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
 /*
 * We must be able to distinguish between no-tail and the tail at 0:0,
@@ -126,7 +138,13 @@ static inline __pure struct mcs_spinlock *decode_tail(u32 tail)
        int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1;
        int idx = (tail &  _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
-        return per_cpu_ptr(&mcs_nodes[idx], cpu);
+        return per_cpu_ptr(&qnodes[idx].mcs, cpu);
+}
+static inline __pure    /* returns the mcs member of the qnode idx slots after base's qnode */
+struct mcs_spinlock *grab_mcs_node(struct mcs_spinlock *base, int idx)
+{
+        return &((struct qnode *)base + idx)->mcs;      /* stride is sizeof(struct qnode), not sizeof(*base) */
 }
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
@@ -390,11 +408,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 queue:
        qstat_inc(qstat_lock_slowpath, true);
 pv_queue:
-        node = this_cpu_ptr(&mcs_nodes[0]);
+        node = this_cpu_ptr(&qnodes[0].mcs);
        idx = node->count++;
        tail = encode_tail(smp_processor_id(), idx);
-        node += idx;
+        node = grab_mcs_node(node, idx);
        /*
         * Keep counts of non-zero index values:
@@ -534,7 +552,7 @@ release:
        /*
         * release the node
         */
-        __this_cpu_dec(mcs_nodes[0].count);
+        __this_cpu_dec(qnodes[0].mcs.count);
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 5a0cf5f9008c..0130e488ebfe 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -49,8 +49,6 @@ enum vcpu_state {
 struct pv_node {
        struct mcs_spinlock     mcs;
-        struct mcs_spinlock     __res[3];
        int                     cpu;
        u8                      state;
 };
@@ -281,7 +279,7 @@ static void pv_init_node(struct mcs_spinlock *node)
 {
        struct pv_node *pn = (struct pv_node *)node;
-        BUILD_BUG_ON(sizeof(struct pv_node) > 5*sizeof(struct mcs_spinlock));
+        BUILD_BUG_ON(sizeof(struct pv_node) > sizeof(struct qnode));
        pn->cpu = smp_processor_id();
        pn->state = vcpu_running;