author		Andi Kleen <ak@suse.de>	2005-09-12 12:49:24 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-09-12 13:49:58 -0400
commit		e5bc8b6baf86538f3d0407cf0880f86aec828a13 (patch)
tree		b67a98a866a51286fd342d08b27b511826875138
parent		83b942bd3437d84d6ddf582477120b6b86369052 (diff)
[PATCH] x86-64: Make remote TLB flush more scalable
Instead of using a global spinlock to protect the state of the remote TLB
flush use a lock and state for each sending CPU. To tell the receiver
where to look for the state use 8 different call vectors. Each CPU uses a
specific vector to trigger flushes on other CPUs. Depending on the
received vector the target CPUs look into the right per cpu variable for
the flush data.

When the system has more than 8 CPUs they are hashed to the 8 available
vectors. The limited global vector space forces us to this right now. In
future when interrupts are split into per CPU domains this could be fixed,
at the cost of needing more IPIs in flat mode.

Also some minor cleanup in the smp flush code and remove some outdated
debug code.

Requires patch to move cpu_possible_map setup earlier.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--	arch/x86_64/kernel/entry.S	15
-rw-r--r--	arch/x86_64/kernel/i8259.c	20
-rw-r--r--	arch/x86_64/kernel/smp.c	123
-rw-r--r--	include/asm-x86_64/hw_irq.h	17
4 files changed, 117 insertions, 58 deletions
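
The commit message above hashes each sending CPU onto one of 8 TLB-flush vectors, and the receiver maps the arriving vector back to the sender's per-CPU flush state. Below is a minimal sketch of that two-way mapping, using the vector constants this patch adds but with hypothetical helper names (the patch itself open-codes both expressions in flush_tlb_others() and smp_invalidate_interrupt()):

	#define INVALIDATE_TLB_VECTOR_START	0xf0
	#define NUM_INVALIDATE_TLB_VECTORS	8

	/* Sender side: pick the vector (and per-CPU flush_state slot) for this CPU. */
	static inline int flush_sender_index(int cpu)
	{
		return cpu % NUM_INVALIDATE_TLB_VECTORS;
	}

	/* Receiver side: recover the sender's slot from the vector that fired. */
	static inline int flush_sender_from_vector(int vector)
	{
		return vector - INVALIDATE_TLB_VECTOR_START;
	}

With 8 or fewer possible CPUs the mapping is one-to-one and each sender owns a private slot; beyond that, two senders can hash to the same slot, which is why each slot still carries its own tlbstate_lock.
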
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 3620508c8bd9..873c39d8f818 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -536,8 +536,19 @@ ENTRY(thermal_interrupt)
 ENTRY(reschedule_interrupt)
 	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
 
-ENTRY(invalidate_interrupt)
-	apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
+	.macro INVALIDATE_ENTRY num
+ENTRY(invalidate_interrupt\num)
+	apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
+	.endm
+
+	INVALIDATE_ENTRY 0
+	INVALIDATE_ENTRY 1
+	INVALIDATE_ENTRY 2
+	INVALIDATE_ENTRY 3
+	INVALIDATE_ENTRY 4
+	INVALIDATE_ENTRY 5
+	INVALIDATE_ENTRY 6
+	INVALIDATE_ENTRY 7
 
 ENTRY(call_function_interrupt)
 	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index a89169095129..909fc63c03b6 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -486,7 +486,14 @@ void spurious_interrupt(void);
 void error_interrupt(void);
 void reschedule_interrupt(void);
 void call_function_interrupt(void);
-void invalidate_interrupt(void);
+void invalidate_interrupt0(void);
+void invalidate_interrupt1(void);
+void invalidate_interrupt2(void);
+void invalidate_interrupt3(void);
+void invalidate_interrupt4(void);
+void invalidate_interrupt5(void);
+void invalidate_interrupt6(void);
+void invalidate_interrupt7(void);
 void thermal_interrupt(void);
 void i8254_timer_resume(void);
 
@@ -562,8 +569,15 @@ void __init init_IRQ(void)
 	 */
 	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-	/* IPI for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	/* IPIs for invalidation */
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
 
 	/* IPI for generic function call */
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 82d38f145b43..801db885955c 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -29,6 +29,8 @@
 #include <asm/proto.h>
 #include <asm/apicdef.h>
 
+#define __cpuinit __init
+
 /*
  * Smarter SMP flushing macros.
  *		c/o Linus Torvalds.
@@ -37,19 +39,41 @@
  * writing to user space from interrupts. (Its not allowed anyway).
  *
  * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ *	More scalable flush, from Andi Kleen
+ *
+ *	To avoid global state use 8 different call vectors.
+ *	Each CPU uses a specific vector to trigger flushes on other
+ *	CPUs. Depending on the received vector the target CPUs look into
+ *	the right per cpu variable for the flush data.
+ *
+ *	With more than 8 CPUs they are hashed to the 8 available
+ *	vectors. The limited global vector space forces us to this right now.
+ *	In future when interrupts are split into per CPU domains this could be
+ *	fixed, at the cost of triggering multiple IPIs in some cases.
  */
 
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
+union smp_flush_state {
+	struct {
+		cpumask_t flush_cpumask;
+		struct mm_struct *flush_mm;
+		unsigned long flush_va;
 #define FLUSH_ALL	-1ULL
+		spinlock_t tlbstate_lock;
+	};
+	char pad[SMP_CACHE_BYTES];
+} ____cacheline_aligned;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  *		instead update mm->cpu_vm_mask.
  */
-static inline void leave_mm (unsigned long cpu)
+static inline void leave_mm(int cpu)
 {
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
@@ -101,15 +125,25 @@ static inline void leave_mm (unsigned long cpu)
  *
  * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
  * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
  */
 
-asmlinkage void smp_invalidate_interrupt (void)
+asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
 {
-	unsigned long cpu;
+	int cpu;
+	int sender;
+	union smp_flush_state *f;
 
-	cpu = get_cpu();
+	cpu = smp_processor_id();
+	/*
+	 * orig_rax contains the interrupt vector - 256.
+	 * Use that to determine where the sender put the data.
+	 */
+	sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+	f = &per_cpu(flush_state, sender);
 
-	if (!cpu_isset(cpu, flush_cpumask))
+	if (!cpu_isset(cpu, f->flush_cpumask))
 		goto out;
 	/*
 	 * This was a BUG() but until someone can quote me the
@@ -120,64 +154,63 @@ asmlinkage void smp_invalidate_interrupt (void)
 	 * BUG();
 	 */
 
-	if (flush_mm == read_pda(active_mm)) {
+	if (f->flush_mm == read_pda(active_mm)) {
 		if (read_pda(mmu_state) == TLBSTATE_OK) {
-			if (flush_va == FLUSH_ALL)
+			if (f->flush_va == FLUSH_ALL)
 				local_flush_tlb();
 			else
-				__flush_tlb_one(flush_va);
+				__flush_tlb_one(f->flush_va);
 		} else
 			leave_mm(cpu);
 	}
 out:
 	ack_APIC_irq();
-	cpu_clear(cpu, flush_cpumask);
-	put_cpu_no_resched();
+	cpu_clear(cpu, f->flush_cpumask);
 }
 
 static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 						unsigned long va)
 {
-	cpumask_t tmp;
-	/*
-	 * A couple of (to be removed) sanity checks:
-	 *
-	 * - we do not send IPIs to not-yet booted CPUs.
-	 * - current CPU must not be in mask
-	 * - mask must exist :)
-	 */
-	BUG_ON(cpus_empty(cpumask));
-	cpus_and(tmp, cpumask, cpu_online_map);
-	BUG_ON(!cpus_equal(tmp, cpumask));
-	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-	if (!mm)
-		BUG();
+	int sender;
+	union smp_flush_state *f;
 
-	/*
-	 * I'm not happy about this global shared spinlock in the
-	 * MM hot path, but we'll see how contended it is.
-	 * Temporarily this turns IRQs off, so that lockups are
-	 * detected by the NMI watchdog.
-	 */
-	spin_lock(&tlbstate_lock);
-
-	flush_mm = mm;
-	flush_va = va;
-	cpus_or(flush_cpumask, cpumask, flush_cpumask);
+	/* Caller has disabled preemption */
+	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	f = &per_cpu(flush_state, sender);
+
+	/* Could avoid this lock when
+	   num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	   probably not worth checking this for a cache-hot lock. */
+	spin_lock(&f->tlbstate_lock);
+
+	f->flush_mm = mm;
+	f->flush_va = va;
+	cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
 
 	/*
 	 * We have to send the IPI only to
 	 * CPUs affected.
 	 */
-	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
 
-	while (!cpus_empty(flush_cpumask))
-		mb();	/* nothing. lockup detection does not belong here */;
+	while (!cpus_empty(f->flush_cpumask))
+		cpu_relax();
 
-	flush_mm = NULL;
-	flush_va = 0;
-	spin_unlock(&tlbstate_lock);
+	f->flush_mm = NULL;
+	f->flush_va = 0;
+	spin_unlock(&f->tlbstate_lock);
 }
+
+int __cpuinit init_smp_flush(void)
+{
+	int i;
+	for_each_cpu_mask(i, cpu_possible_map) {
+		spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+	}
+	return 0;
+}
+
+core_initcall(init_smp_flush);
 
 void flush_tlb_current_task(void)
 {
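
The smp.c change above also pads each per-CPU flush_state entry out to a full cache line, so a CPU spinning on one sender's flush_cpumask does not bounce the cache line holding another sender's state. A standalone sketch of that padding idiom, simplified to two illustrative fields (SMP_CACHE_BYTES and ____cacheline_aligned are the kernel's own symbols; the anonymous struct inside a union is the same GNU C extension the patch relies on):

	#include <linux/cache.h>	/* SMP_CACHE_BYTES, ____cacheline_aligned */
	#include <linux/spinlock.h>

	/* Overlaying the payload with a cache-line-sized pad makes sizeof(union)
	   at least SMP_CACHE_BYTES, so adjacent per-CPU entries never share a line. */
	union padded_state {
		struct {
			spinlock_t lock;
			unsigned long data;
		};
		char pad[SMP_CACHE_BYTES];
	} ____cacheline_aligned;
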
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 2b5cb2865d21..d9212eb4e894 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -50,14 +50,15 @@ struct hw_interrupt_type;
  */
 #define SPURIOUS_APIC_VECTOR	0xff
 #define ERROR_APIC_VECTOR	0xfe
-#define INVALIDATE_TLB_VECTOR	0xfd
-#define RESCHEDULE_VECTOR	0xfc
-#define TASK_MIGRATION_VECTOR	0xfb
-#define CALL_FUNCTION_VECTOR	0xfa
-#define KDB_VECTOR	0xf9
-
-#define THERMAL_APIC_VECTOR	0xf0
+#define RESCHEDULE_VECTOR	0xfd
+#define CALL_FUNCTION_VECTOR	0xfc
+#define KDB_VECTOR	0xfb	/* reserved for KDB */
+#define THERMAL_APIC_VECTOR	0xfa
+/* 0xf9 free */
+#define INVALIDATE_TLB_VECTOR_END	0xf8
+#define INVALIDATE_TLB_VECTOR_START	0xf0	/* f0-f8 used for TLB flush */
 
+#define NUM_INVALIDATE_TLB_VECTORS	8
 
 /*
  * Local APIC timer IRQ vector is on a different priority level,