path: root/arch/x86_64/kernel/smp.c
author    Andi Kleen <ak@suse.de>                2005-09-12 12:49:24 -0400
committer Linus Torvalds <torvalds@g5.osdl.org> 2005-09-12 13:49:58 -0400
commit    e5bc8b6baf86538f3d0407cf0880f86aec828a13 (patch)
tree      b67a98a866a51286fd342d08b27b511826875138 /arch/x86_64/kernel/smp.c
parent    83b942bd3437d84d6ddf582477120b6b86369052 (diff)
[PATCH] x86-64: Make remote TLB flush more scalable
Instead of using a global spinlock to protect the state of the remote TLB flush, use a lock and state for each sending CPU. To tell the receiver where to look for the state, use 8 different call vectors. Each CPU uses a specific vector to trigger flushes on other CPUs. Depending on the received vector, the target CPUs look into the right per-CPU variable for the flush data.

When the system has more than 8 CPUs they are hashed to the 8 available vectors. The limited global vector space forces us to do this right now. In future, when interrupts are split into per-CPU domains, this could be fixed, at the cost of needing more IPIs in flat mode.

Also some minor cleanup in the SMP flush code and removal of some outdated debug code.

Requires the patch that moves cpu_possible_map setup earlier.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
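As an illustration of the hashing described above, here is a minimal user-space C sketch (not the kernel code; struct flush_slot, slot_for_cpu() and the demo main() are hypothetical names invented for this example, while NUM_INVALIDATE_TLB_VECTORS mirrors the constant used in the patch). It shows how senders map onto the 8 per-sender state slots, so that with more than 8 CPUs several senders share a slot and would contend on that slot's lock:

/* Hypothetical, simplified user-space model of the per-sender flush state.
 * Each sending CPU is hashed onto one of 8 slots; the receiver would use
 * the received interrupt vector to find the same slot. Not kernel code. */
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8

struct flush_slot {
	int owner_cpu;          /* sending CPU currently publishing here */
	unsigned long flush_va; /* address to flush, ~0UL meaning "flush all" */
};

static struct flush_slot flush_state[NUM_INVALIDATE_TLB_VECTORS];

/* With more CPUs than vectors, several senders hash to the same slot
 * (and would therefore serialize on that slot's lock in the real patch). */
static int slot_for_cpu(int cpu)
{
	return cpu % NUM_INVALIDATE_TLB_VECTORS;
}

int main(void)
{
	for (int cpu = 0; cpu < 16; cpu++) {
		int s = slot_for_cpu(cpu);
		flush_state[s].owner_cpu = cpu;
		flush_state[s].flush_va = ~0UL;
		printf("CPU %2d publishes flush data in slot %d "
		       "and sends vector START+%d\n", cpu, s, s);
	}
	return 0;
}

Compiled with any C compiler, this prints CPUs 0 and 8, 1 and 9, and so on sharing a slot, which is exactly the hashing the commit message refers to.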
Diffstat (limited to 'arch/x86_64/kernel/smp.c')
-rw-r--r--  arch/x86_64/kernel/smp.c | 123
1 file changed, 78 insertions(+), 45 deletions(-)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 82d38f145b43..801db885955c 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -29,6 +29,8 @@
 #include <asm/proto.h>
 #include <asm/apicdef.h>
 
+#define __cpuinit __init
+
 /*
  * Smarter SMP flushing macros.
  * c/o Linus Torvalds.
@@ -37,19 +39,41 @@
  * writing to user space from interrupts. (Its not allowed anyway).
  *
  * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ * More scalable flush, from Andi Kleen
+ *
+ * To avoid global state use 8 different call vectors.
+ * Each CPU uses a specific vector to trigger flushes on other
+ * CPUs. Depending on the received vector the target CPUs look into
+ * the right per cpu variable for the flush data.
+ *
+ * With more than 8 CPUs they are hashed to the 8 available
+ * vectors. The limited global vector space forces us to this right now.
+ * In future when interrupts are split into per CPU domains this could be
+ * fixed, at the cost of triggering multiple IPIs in some cases.
  */
 
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
+union smp_flush_state {
+	struct {
+		cpumask_t flush_cpumask;
+		struct mm_struct *flush_mm;
+		unsigned long flush_va;
 #define FLUSH_ALL	-1ULL
+		spinlock_t tlbstate_lock;
+	};
+	char pad[SMP_CACHE_BYTES];
+} ____cacheline_aligned;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  */
-static inline void leave_mm (unsigned long cpu)
+static inline void leave_mm(int cpu)
 {
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
@@ -101,15 +125,25 @@ static inline void leave_mm (unsigned long cpu)
  *
  * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
  * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
  */
 
-asmlinkage void smp_invalidate_interrupt (void)
+asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
 {
-	unsigned long cpu;
+	int cpu;
+	int sender;
+	union smp_flush_state *f;
 
-	cpu = get_cpu();
+	cpu = smp_processor_id();
+	/*
+	 * orig_rax contains the interrupt vector - 256.
+	 * Use that to determine where the sender put the data.
+	 */
+	sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+	f = &per_cpu(flush_state, sender);
 
-	if (!cpu_isset(cpu, flush_cpumask))
+	if (!cpu_isset(cpu, f->flush_cpumask))
 		goto out;
 	/*
 	 * This was a BUG() but until someone can quote me the
@@ -120,64 +154,63 @@ asmlinkage void smp_invalidate_interrupt (void)
 	 * BUG();
 	 */
 
-	if (flush_mm == read_pda(active_mm)) {
+	if (f->flush_mm == read_pda(active_mm)) {
 		if (read_pda(mmu_state) == TLBSTATE_OK) {
-			if (flush_va == FLUSH_ALL)
+			if (f->flush_va == FLUSH_ALL)
 				local_flush_tlb();
 			else
-				__flush_tlb_one(flush_va);
+				__flush_tlb_one(f->flush_va);
 		} else
 			leave_mm(cpu);
 	}
 out:
 	ack_APIC_irq();
-	cpu_clear(cpu, flush_cpumask);
-	put_cpu_no_resched();
+	cpu_clear(cpu, f->flush_cpumask);
 }
 
 static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 						unsigned long va)
 {
-	cpumask_t tmp;
-	/*
-	 * A couple of (to be removed) sanity checks:
-	 *
-	 * - we do not send IPIs to not-yet booted CPUs.
-	 * - current CPU must not be in mask
-	 * - mask must exist :)
-	 */
-	BUG_ON(cpus_empty(cpumask));
-	cpus_and(tmp, cpumask, cpu_online_map);
-	BUG_ON(!cpus_equal(tmp, cpumask));
-	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-	if (!mm)
-		BUG();
+	int sender;
+	union smp_flush_state *f;
 
-	/*
-	 * I'm not happy about this global shared spinlock in the
-	 * MM hot path, but we'll see how contended it is.
-	 * Temporarily this turns IRQs off, so that lockups are
-	 * detected by the NMI watchdog.
-	 */
-	spin_lock(&tlbstate_lock);
-
-	flush_mm = mm;
-	flush_va = va;
-	cpus_or(flush_cpumask, cpumask, flush_cpumask);
+	/* Caller has disabled preemption */
+	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	f = &per_cpu(flush_state, sender);
+
+	/* Could avoid this lock when
+	   num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	   probably not worth checking this for a cache-hot lock. */
+	spin_lock(&f->tlbstate_lock);
+
+	f->flush_mm = mm;
+	f->flush_va = va;
+	cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
 
 	/*
 	 * We have to send the IPI only to
 	 * CPUs affected.
 	 */
-	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
 
-	while (!cpus_empty(flush_cpumask))
-		mb();	/* nothing. lockup detection does not belong here */;
+	while (!cpus_empty(f->flush_cpumask))
+		cpu_relax();
 
-	flush_mm = NULL;
-	flush_va = 0;
-	spin_unlock(&tlbstate_lock);
+	f->flush_mm = NULL;
+	f->flush_va = 0;
+	spin_unlock(&f->tlbstate_lock);
 }
+
+int __cpuinit init_smp_flush(void)
+{
+	int i;
+	for_each_cpu_mask(i, cpu_possible_map) {
+		spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+	}
+	return 0;
+}
+
+core_initcall(init_smp_flush);
 
 void flush_tlb_current_task(void)
 {