author    Shaohua Li <shaohua.li@intel.com>    2010-10-19 23:07:03 -0400
committer H. Peter Anvin <hpa@linux.intel.com> 2010-10-20 17:44:42 -0400
commit    932967202182743c01a2eee4bdfa2c42697bc586 (patch)
tree      ab4b813b6ce0ce17dedc977b57bb1a0d9c4d3888
parent    c957ef2c59e952803766ddc22e89981ab534606f (diff)
x86: Spread tlb flush vector between nodes
Currently flush TLB vector allocation is based on the equation:

    sender = smp_processor_id() % 8

This isn't optimal: CPUs from different nodes can get the same vector, which causes a lot of lock contention. Instead, we can assign the same vectors to CPUs of the same node, while different nodes get different vectors. This has two advantages:

a. If there is lock contention, it is between CPUs of one node, which should be much cheaper than contention between nodes.
b. Lock contention between nodes is avoided completely. This especially benefits kswapd, the biggest user of TLB flush, since kswapd sets its affinity to a specific node.

In my test this reduced CPU overhead by more than 20% in the extreme case. The test machine has 4 nodes, each with 16 CPUs. I bind each node's kswapd to the first CPU of its node and run a workload of 4 threads doing sequential reads of mmap'ed files. The files are empty sparse files, so the workload triggers a lot of page reclaim and TLB flushing. Binding kswapd makes the extreme TLB flush lock contention easy to trigger; otherwise kswapd keeps migrating between the CPUs of a node and I can't get stable results. Real workloads won't always show such heavy TLB flush lock contention, but it is possible.

[ hpa: folded in fix from Eric Dumazet to use this_cpu_read() ]

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
LKML-Reference: <1287544023.4571.8.camel@sli10-conroe.sh.intel.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
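To make the new scheme concrete, here is a small user-space sketch (not part of the patch) that mirrors the arithmetic of the calculate_tlb_offset() helper added in the diff below. It assumes the 4-node, 16-CPUs-per-node test machine described above, CPUs numbered consecutively within each node, and NUM_INVALIDATE_TLB_VECTORS == 8 (the "% 8" in the old formula); the real kernel code walks cpumask_of_node() instead of assuming any particular CPU numbering.

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8   /* matches the old "% 8" */
#define NR_NODES                   4   /* assumed test-machine topology */
#define CPUS_PER_NODE              16  /* assumed test-machine topology */

int main(void)
{
        /* each node gets a private block of vectors (8 / 4 = 2) */
        int nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS / NR_NODES;
        int cpu;

        for (cpu = 0; cpu < NR_NODES * CPUS_PER_NODE; cpu++) {
                int node = cpu / CPUS_PER_NODE;   /* assumed numbering */

                /* old scheme: all nodes share the same 8 vectors */
                int old_vec = cpu % NUM_INVALIDATE_TLB_VECTORS;

                /* new scheme: node picks its block, CPUs rotate inside it */
                int new_vec = (node % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs
                              + (cpu % CPUS_PER_NODE) % nr_node_vecs;

                printf("cpu %2d (node %d): old vector %d -> new vector %d\n",
                       cpu, node, old_vec, new_vec);
        }
        return 0;
}

With the old formula, CPU 0 on node 0 and CPU 16 on node 1 both hash to vector 0, so their flush IPI senders contend on the same tlbstate_lock; with the new one, node 0 only ever uses vectors 0-1, node 1 vectors 2-3, and so on, so any remaining contention stays within a node.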
-rw-r--r--  arch/x86/mm/tlb.c  48
1 file changed, 47 insertions(+), 1 deletion(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
 #include <linux/smp.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/cpu.h>

 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
    want false sharing in the per cpu data segment. */
 static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];

+static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
+
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
        union smp_flush_state *f;

        /* Caller has disabled preemption */
-       sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+       sender = this_cpu_read(tlb_vector_offset);
        f = &flush_state[sender];

        /*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
        flush_tlb_others_ipi(cpumask, mm, va);
 }

+static void __cpuinit calculate_tlb_offset(void)
+{
+       int cpu, node, nr_node_vecs;
+       /*
+        * we are changing tlb_vector_offset for each CPU in runtime, but this
+        * will not cause inconsistency, as the write is atomic under X86. we
+        * might see more lock contentions in a short time, but after all CPU's
+        * tlb_vector_offset are changed, everything should go normal
+        *
+        * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
+        * waste some vectors.
+        **/
+       if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
+               nr_node_vecs = 1;
+       else
+               nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
+
+       for_each_online_node(node) {
+               int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+                       nr_node_vecs;
+               int cpu_offset = 0;
+               for_each_cpu(cpu, cpumask_of_node(node)) {
+                       per_cpu(tlb_vector_offset, cpu) = node_offset +
+                               cpu_offset;
+                       cpu_offset++;
+                       cpu_offset = cpu_offset % nr_node_vecs;
+               }
+       }
+}
+
+static int tlb_cpuhp_notify(struct notifier_block *n,
+               unsigned long action, void *hcpu)
+{
+       switch (action & 0xf) {
+       case CPU_ONLINE:
+       case CPU_DEAD:
+               calculate_tlb_offset();
+       }
+       return NOTIFY_OK;
+}
+
 static int __cpuinit init_smp_flush(void)
 {
        int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
        for (i = 0; i < ARRAY_SIZE(flush_state); i++)
                raw_spin_lock_init(&flush_state[i].tlbstate_lock);

+       calculate_tlb_offset();
+       hotcpu_notifier(tlb_cpuhp_notify, 0);
        return 0;
 }
 core_initcall(init_smp_flush);