Diffstat (limited to 'arch/x86_64/kernel/smp.c')
-rw-r--r--	arch/x86_64/kernel/smp.c	123
1 file changed, 78 insertions(+), 45 deletions(-)
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 82d38f145b43..801db885955c 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -29,6 +29,8 @@
 #include <asm/proto.h>
 #include <asm/apicdef.h>
 
+#define __cpuinit __init
+
 /*
  * Smarter SMP flushing macros.
  * c/o Linus Torvalds.
@@ -37,19 +39,41 @@
  * writing to user space from interrupts. (Its not allowed anyway).
  *
  * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ * More scalable flush, from Andi Kleen
+ *
+ * To avoid global state use 8 different call vectors.
+ * Each CPU uses a specific vector to trigger flushes on other
+ * CPUs. Depending on the received vector the target CPUs look into
+ * the right per cpu variable for the flush data.
+ *
+ * With more than 8 CPUs they are hashed to the 8 available
+ * vectors. The limited global vector space forces us to this right now.
+ * In future when interrupts are split into per CPU domains this could be
+ * fixed, at the cost of triggering multiple IPIs in some cases.
  */
 
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
+union smp_flush_state {
+	struct {
+		cpumask_t flush_cpumask;
+		struct mm_struct *flush_mm;
+		unsigned long flush_va;
 #define FLUSH_ALL	-1ULL
+		spinlock_t tlbstate_lock;
+	};
+	char pad[SMP_CACHE_BYTES];
+} ____cacheline_aligned;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  */
-static inline void leave_mm (unsigned long cpu)
+static inline void leave_mm(int cpu)
 {
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
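This hunk is the core of the change: the single global flush_mm/flush_va/flush_cpumask set, serialized by one spinlock, becomes per-vector state padded to a full cache line so senders using different vectors never false-share. A standalone userspace sketch of that layout and of the CPU-to-vector hashing described in the comment (cpumask_t and spinlock_t replaced by stand-ins, a plain array instead of DEFINE_PER_CPU, cache-line size assumed):

/* Standalone illustration (userspace C, not kernel code) of the padded
 * per-vector flush state and the cpu -> vector hashing in the patch.
 * Names mirror the patch; the sizes and stand-in types are assumptions. */
#include <assert.h>
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8	/* 8 vectors, as in the patch */
#define SMP_CACHE_BYTES 64		/* assumed cache line size */

union smp_flush_state {
	struct {
		unsigned long flush_cpumask;	/* stand-in for cpumask_t */
		void *flush_mm;
		unsigned long flush_va;
		int tlbstate_lock;		/* stand-in for spinlock_t */
	};
	char pad[SMP_CACHE_BYTES];		/* pad to a full cache line */
};

/* One slot per vector; in the kernel this is DEFINE_PER_CPU data
 * indexed by the sender's vector number. */
static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];

/* With more than 8 CPUs, senders hash onto the 8 available vectors. */
static int sender_vector(int cpu)
{
	return cpu % NUM_INVALIDATE_TLB_VECTORS;
}

int main(void)
{
	/* Each slot occupies (at least) its own cache line. */
	assert(sizeof(union smp_flush_state) >= SMP_CACHE_BYTES);

	/* CPUs 0 and 8 share vector 0; CPU 3 gets vector 3, etc. */
	printf("cpu 0 -> vector %d\n", sender_vector(0));
	printf("cpu 3 -> vector %d\n", sender_vector(3));
	printf("cpu 8 -> vector %d\n", sender_vector(8));
	return 0;
}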
@@ -101,15 +125,25 @@ static inline void leave_mm (unsigned long cpu)
  *
  * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
  * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
  */
 
-asmlinkage void smp_invalidate_interrupt (void)
+asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
 {
-	unsigned long cpu;
+	int cpu;
+	int sender;
+	union smp_flush_state *f;
 
-	cpu = get_cpu();
+	cpu = smp_processor_id();
+	/*
+	 * orig_rax contains the interrupt vector - 256.
+	 * Use that to determine where the sender put the data.
+	 */
+	sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+	f = &per_cpu(flush_state, sender);
 
-	if (!cpu_isset(cpu, flush_cpumask))
+	if (!cpu_isset(cpu, f->flush_cpumask))
 		goto out;
 	/*
 	 * This was a BUG() but until someone can quote me the
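On the receiving side, the handler no longer needs global state: the vector it was invoked with identifies the sender's slot. Per the patch's own comment, the entry code leaves the interrupt vector minus 256 in orig_rax, so adding 256 back and subtracting INVALIDATE_TLB_VECTOR_START yields the per_cpu(flush_state, sender) index. A small standalone sketch of that arithmetic (the 0xf0 base vector used here is an assumption for illustration, not taken from this diff):

/* Sketch of the receiver-side index calculation in smp_invalidate_interrupt().
 * The entry code stores (vector - 256) in orig_rax, so adding 256 back and
 * subtracting the base vector yields the slot the sender used. */
#include <stdio.h>

#define INVALIDATE_TLB_VECTOR_START 0xf0	/* assumed base vector */
#define NUM_INVALIDATE_TLB_VECTORS  8

static int sender_from_orig_rax(long orig_rax)
{
	/* orig_rax == interrupt vector - 256 */
	return (int)(orig_rax + 256 - INVALIDATE_TLB_VECTOR_START);
}

int main(void)
{
	/* A flush sent on vector 0xf3 arrives with orig_rax == 0xf3 - 256. */
	long orig_rax = 0xf3 - 256;
	int sender = sender_from_orig_rax(orig_rax);

	printf("sender slot = %d\n", sender);	/* prints 3 */
	return 0;
}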
@@ -120,64 +154,63 @@ asmlinkage void smp_invalidate_interrupt (void)
 	 * BUG();
 	 */
 
-	if (flush_mm == read_pda(active_mm)) {
+	if (f->flush_mm == read_pda(active_mm)) {
 		if (read_pda(mmu_state) == TLBSTATE_OK) {
-			if (flush_va == FLUSH_ALL)
+			if (f->flush_va == FLUSH_ALL)
 				local_flush_tlb();
 			else
-				__flush_tlb_one(flush_va);
+				__flush_tlb_one(f->flush_va);
 		} else
 			leave_mm(cpu);
 	}
 out:
 	ack_APIC_irq();
-	cpu_clear(cpu, flush_cpumask);
-	put_cpu_no_resched();
+	cpu_clear(cpu, f->flush_cpumask);
 }
 
 static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 						unsigned long va)
 {
-	cpumask_t tmp;
-	/*
-	 * A couple of (to be removed) sanity checks:
-	 *
-	 * - we do not send IPIs to not-yet booted CPUs.
-	 * - current CPU must not be in mask
-	 * - mask must exist :)
-	 */
-	BUG_ON(cpus_empty(cpumask));
-	cpus_and(tmp, cpumask, cpu_online_map);
-	BUG_ON(!cpus_equal(tmp, cpumask));
-	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-	if (!mm)
-		BUG();
+	int sender;
+	union smp_flush_state *f;
 
-	/*
-	 * I'm not happy about this global shared spinlock in the
-	 * MM hot path, but we'll see how contended it is.
-	 * Temporarily this turns IRQs off, so that lockups are
-	 * detected by the NMI watchdog.
-	 */
-	spin_lock(&tlbstate_lock);
-	
-	flush_mm = mm;
-	flush_va = va;
-	cpus_or(flush_cpumask, cpumask, flush_cpumask);
+	/* Caller has disabled preemption */
+	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	f = &per_cpu(flush_state, sender);
+
+	/* Could avoid this lock when
+	   num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	   probably not worth checking this for a cache-hot lock. */
+	spin_lock(&f->tlbstate_lock);
+
+	f->flush_mm = mm;
+	f->flush_va = va;
+	cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
 
 	/*
 	 * We have to send the IPI only to
 	 * CPUs affected.
 	 */
-	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
 
-	while (!cpus_empty(flush_cpumask))
-		mb();	/* nothing. lockup detection does not belong here */;
+	while (!cpus_empty(f->flush_cpumask))
+		cpu_relax();
 
-	flush_mm = NULL;
-	flush_va = 0;
-	spin_unlock(&tlbstate_lock);
+	f->flush_mm = NULL;
+	f->flush_va = 0;
+	spin_unlock(&f->tlbstate_lock);
 }
+
+int __cpuinit init_smp_flush(void)
+{
+	int i;
+	for_each_cpu_mask(i, cpu_possible_map) {
+		spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+	}
+	return 0;
+}
+
+core_initcall(init_smp_flush);
 
 void flush_tlb_current_task(void)
 {
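On the sender side, flush_tlb_others() in the hunk above hashes the calling CPU onto one of the eight slots, takes that slot's lock, publishes mm/va, ORs the target CPUs into the slot's cpumask, sends the IPI on INVALIDATE_TLB_VECTOR_START + sender, and spins with cpu_relax() until every target has cleared its bit. A minimal userspace sketch of that publish/acknowledge handshake (single sender, one target; C11 atomics stand in for the spinlock, per-CPU data and the real IPI, so this is an illustration, not kernel code):

/* Sketch of the sender/receiver handshake per flush slot. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct flush_slot {
	unsigned long flush_va;		/* published before the mask bit */
	atomic_ulong flush_cpumask;	/* bit n set == CPU n still flushing */
};

static struct flush_slot slot;		/* one slot; the patch has 8 */

#define TARGET_CPU 1

static void *receiver(void *arg)
{
	(void)arg;
	/* Wait for "the IPI": our bit showing up in the slot's mask. */
	while (!(atomic_load_explicit(&slot.flush_cpumask,
				      memory_order_acquire) & (1UL << TARGET_CPU)))
		;					/* cpu_relax() */

	printf("cpu %d flushes va %#lx\n", TARGET_CPU, slot.flush_va);

	/* Acknowledge by clearing our bit, like cpu_clear() in the handler. */
	atomic_fetch_and_explicit(&slot.flush_cpumask,
				  ~(1UL << TARGET_CPU), memory_order_release);
	return NULL;
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, receiver, NULL);

	/* Sender side, as in flush_tlb_others(): publish the data ... */
	slot.flush_va = 0x1234000UL;
	/* ... then OR in the targets (the release also publishes flush_va). */
	atomic_fetch_or_explicit(&slot.flush_cpumask,
				 1UL << TARGET_CPU, memory_order_release);

	/* ... and spin until every target has acknowledged. */
	while (atomic_load_explicit(&slot.flush_cpumask, memory_order_acquire))
		;					/* cpu_relax() */

	slot.flush_va = 0;
	printf("sender: all targets done\n");
	pthread_join(t, NULL);
	return 0;
}

In the kernel the slot's spinlock serializes concurrent senders that hash to the same vector; the sketch omits it because it has only one sender.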