author     Andi Kleen <ak@suse.de>                  2005-09-12 12:49:24 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>    2005-09-12 13:49:58 -0400
commit     e5bc8b6baf86538f3d0407cf0880f86aec828a13
tree       b67a98a866a51286fd342d08b27b511826875138
parent     83b942bd3437d84d6ddf582477120b6b86369052
[PATCH] x86-64: Make remote TLB flush more scalable
Instead of using a global spinlock to protect the state of the remote
TLB flush, use a lock and state for each sending CPU.

To tell the receiver where to look for the state, use 8 different call
vectors. Each CPU uses a specific vector to trigger flushes on other
CPUs. Depending on the vector received, the target CPUs look into the
right per-CPU variable for the flush data.

When the system has more than 8 CPUs, they are hashed onto the 8
available vectors; the limited global vector space forces this for now.
In the future, when interrupts are split into per-CPU domains, this
could be fixed, at the cost of needing more IPIs in flat mode.

Also do some minor cleanup in the SMP flush code and remove some
outdated debug code.
Requires patch to move cpu_possible_map setup earlier.
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
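
[Editor's note: to make the scheme described above concrete, here is a
minimal, self-contained C sketch of the sender-side idea: one padded
flush descriptor per vector slot, and a hash from the sending CPU onto
the 8 vectors. It borrows the names introduced by this patch
(NUM_INVALIDATE_TLB_VECTORS, INVALIDATE_TLB_VECTOR_START,
union smp_flush_state) but uses stand-in types and a hypothetical
pick_flush_vector() helper; it is an illustration, not kernel code.]

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS   8
#define INVALIDATE_TLB_VECTOR_START  0xf0

/* One flush descriptor per sender slot, padded to a cache line so CPUs
   polling one sender's state do not false-share with another sender's.
   The kernel uses cpumask_t, spinlock_t, SMP_CACHE_BYTES and
   DEFINE_PER_CPU here; these are simplified stand-ins. */
union smp_flush_state {
	struct {
		unsigned long flush_cpumask;
		void *flush_mm;
		unsigned long flush_va;
		int tlbstate_lock;
	};
	char pad[64];
};

static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];

/* A sending CPU hashes itself onto one of the 8 vectors; receivers
   recover the slot index from the vector they were interrupted on and
   read that sender's flush_state entry. */
static int pick_flush_vector(int cpu)
{
	return INVALIDATE_TLB_VECTOR_START + (cpu % NUM_INVALIDATE_TLB_VECTORS);
}

int main(void)
{
	/* With more than 8 CPUs the mapping wraps around: CPU 8 shares
	   vector 0xf0 with CPU 0, CPU 11 uses 0xf3, and so on. */
	for (int cpu = 0; cpu < 12; cpu++)
		printf("cpu %2d -> invalidate vector 0x%x\n",
		       cpu, pick_flush_vector(cpu));
	(void)flush_state;
	return 0;
}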
-rw-r--r--  arch/x86_64/kernel/entry.S   |  15
-rw-r--r--  arch/x86_64/kernel/i8259.c   |  20
-rw-r--r--  arch/x86_64/kernel/smp.c     | 123
-rw-r--r--  include/asm-x86_64/hw_irq.h  |  17
4 files changed, 117 insertions(+), 58 deletions(-)
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 3620508c8bd9..873c39d8f818 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -536,8 +536,19 @@ ENTRY(thermal_interrupt)
 ENTRY(reschedule_interrupt)
 	apicinterrupt RESCHEDULE_VECTOR,smp_reschedule_interrupt
 
-ENTRY(invalidate_interrupt)
-	apicinterrupt INVALIDATE_TLB_VECTOR,smp_invalidate_interrupt
+	.macro INVALIDATE_ENTRY num
+ENTRY(invalidate_interrupt\num)
+	apicinterrupt INVALIDATE_TLB_VECTOR_START+\num,smp_invalidate_interrupt
+	.endm
+
+	INVALIDATE_ENTRY 0
+	INVALIDATE_ENTRY 1
+	INVALIDATE_ENTRY 2
+	INVALIDATE_ENTRY 3
+	INVALIDATE_ENTRY 4
+	INVALIDATE_ENTRY 5
+	INVALIDATE_ENTRY 6
+	INVALIDATE_ENTRY 7
 
 ENTRY(call_function_interrupt)
 	apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index a89169095129..909fc63c03b6 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -486,7 +486,14 @@ void spurious_interrupt(void);
 void error_interrupt(void);
 void reschedule_interrupt(void);
 void call_function_interrupt(void);
-void invalidate_interrupt(void);
+void invalidate_interrupt0(void);
+void invalidate_interrupt1(void);
+void invalidate_interrupt2(void);
+void invalidate_interrupt3(void);
+void invalidate_interrupt4(void);
+void invalidate_interrupt5(void);
+void invalidate_interrupt6(void);
+void invalidate_interrupt7(void);
 void thermal_interrupt(void);
 void i8254_timer_resume(void);
 
@@ -562,8 +569,15 @@ void __init init_IRQ(void)
 	 */
 	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-	/* IPI for invalidation */
-	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	/* IPIs for invalidation */
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	set_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
 
 	/* IPI for generic function call */
 	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 82d38f145b43..801db885955c 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -29,6 +29,8 @@
 #include <asm/proto.h>
 #include <asm/apicdef.h>
 
+#define __cpuinit __init
+
 /*
  * Smarter SMP flushing macros.
  * c/o Linus Torvalds.
@@ -37,19 +39,41 @@
  * writing to user space from interrupts. (Its not allowed anyway).
  *
  * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ *
+ * More scalable flush, from Andi Kleen
+ *
+ * To avoid global state use 8 different call vectors.
+ * Each CPU uses a specific vector to trigger flushes on other
+ * CPUs. Depending on the received vector the target CPUs look into
+ * the right per cpu variable for the flush data.
+ *
+ * With more than 8 CPUs they are hashed to the 8 available
+ * vectors. The limited global vector space forces us to this right now.
+ * In future when interrupts are split into per CPU domains this could be
+ * fixed, at the cost of triggering multiple IPIs in some cases.
  */
 
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
+union smp_flush_state {
+	struct {
+		cpumask_t flush_cpumask;
+		struct mm_struct *flush_mm;
+		unsigned long flush_va;
 #define FLUSH_ALL	-1ULL
+		spinlock_t tlbstate_lock;
+	};
+	char pad[SMP_CACHE_BYTES];
+} ____cacheline_aligned;
+
+/* State is put into the per CPU data section, but padded
+   to a full cache line because other CPUs can access it and we don't
+   want false sharing in the per cpu data segment. */
+static DEFINE_PER_CPU(union smp_flush_state, flush_state);
 
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
  */
-static inline void leave_mm (unsigned long cpu)
+static inline void leave_mm(int cpu)
 {
 	if (read_pda(mmu_state) == TLBSTATE_OK)
 		BUG();
@@ -101,15 +125,25 @@ static inline void leave_mm (unsigned long cpu)
  *
  * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
  * 2) Leave the mm if we are in the lazy tlb mode.
+ *
+ * Interrupts are disabled.
  */
 
-asmlinkage void smp_invalidate_interrupt (void)
+asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
 {
-	unsigned long cpu;
+	int cpu;
+	int sender;
+	union smp_flush_state *f;
 
-	cpu = get_cpu();
+	cpu = smp_processor_id();
+	/*
+	 * orig_rax contains the interrupt vector - 256.
+	 * Use that to determine where the sender put the data.
+	 */
+	sender = regs->orig_rax + 256 - INVALIDATE_TLB_VECTOR_START;
+	f = &per_cpu(flush_state, sender);
 
-	if (!cpu_isset(cpu, flush_cpumask))
+	if (!cpu_isset(cpu, f->flush_cpumask))
 		goto out;
 	/*
 	 * This was a BUG() but until someone can quote me the
@@ -120,64 +154,63 @@ asmlinkage void smp_invalidate_interrupt (void)
 	 * BUG();
 	 */
 
-	if (flush_mm == read_pda(active_mm)) {
+	if (f->flush_mm == read_pda(active_mm)) {
 		if (read_pda(mmu_state) == TLBSTATE_OK) {
-			if (flush_va == FLUSH_ALL)
+			if (f->flush_va == FLUSH_ALL)
 				local_flush_tlb();
 			else
-				__flush_tlb_one(flush_va);
+				__flush_tlb_one(f->flush_va);
 		} else
 			leave_mm(cpu);
 	}
 out:
 	ack_APIC_irq();
-	cpu_clear(cpu, flush_cpumask);
-	put_cpu_no_resched();
+	cpu_clear(cpu, f->flush_cpumask);
 }
 
 static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
 						unsigned long va)
 {
-	cpumask_t tmp;
-	/*
-	 * A couple of (to be removed) sanity checks:
-	 *
-	 * - we do not send IPIs to not-yet booted CPUs.
-	 * - current CPU must not be in mask
-	 * - mask must exist :)
-	 */
-	BUG_ON(cpus_empty(cpumask));
-	cpus_and(tmp, cpumask, cpu_online_map);
-	BUG_ON(!cpus_equal(tmp, cpumask));
-	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
-	if (!mm)
-		BUG();
+	int sender;
+	union smp_flush_state *f;
 
-	/*
-	 * I'm not happy about this global shared spinlock in the
-	 * MM hot path, but we'll see how contended it is.
-	 * Temporarily this turns IRQs off, so that lockups are
-	 * detected by the NMI watchdog.
-	 */
-	spin_lock(&tlbstate_lock);
-	
-	flush_mm = mm;
-	flush_va = va;
-	cpus_or(flush_cpumask, cpumask, flush_cpumask);
+	/* Caller has disabled preemption */
+	sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
+	f = &per_cpu(flush_state, sender);
+
+	/* Could avoid this lock when
+	   num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
+	   probably not worth checking this for a cache-hot lock. */
+	spin_lock(&f->tlbstate_lock);
+
+	f->flush_mm = mm;
+	f->flush_va = va;
+	cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
 
 	/*
 	 * We have to send the IPI only to
 	 * CPUs affected.
 	 */
-	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+	send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR_START + sender);
 
-	while (!cpus_empty(flush_cpumask))
-		mb();	/* nothing. lockup detection does not belong here */;
+	while (!cpus_empty(f->flush_cpumask))
+		cpu_relax();
 
-	flush_mm = NULL;
-	flush_va = 0;
-	spin_unlock(&tlbstate_lock);
+	f->flush_mm = NULL;
+	f->flush_va = 0;
+	spin_unlock(&f->tlbstate_lock);
 }
+
+int __cpuinit init_smp_flush(void)
+{
+	int i;
+	for_each_cpu_mask(i, cpu_possible_map) {
+		spin_lock_init(&per_cpu(flush_state.tlbstate_lock, i));
+	}
+	return 0;
+}
+
+core_initcall(init_smp_flush);
 
 void flush_tlb_current_task(void)
 {
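
[Editor's note: an aside on the receiver path in smp_invalidate_interrupt()
above: the entry code leaves (vector - 256) in orig_rax, so recovering the
sender's flush_state slot is a small piece of arithmetic. A hedged,
standalone illustration with the patch's vector base; not kernel code.]

#include <stdio.h>

#define INVALIDATE_TLB_VECTOR_START 0xf0

/* Illustrative only: adding 256 back to orig_rax and subtracting the
   base vector yields the index of the per-CPU flush_state slot that
   the sending CPU used. */
static int sender_from_orig_rax(long orig_rax)
{
	return (int)(orig_rax + 256 - INVALIDATE_TLB_VECTOR_START);
}

int main(void)
{
	/* An IPI sent on vector 0xf3 arrives with orig_rax = 0xf3 - 256 = -13 */
	long orig_rax = 0xf3 - 256;
	printf("sender slot = %d\n", sender_from_orig_rax(orig_rax)); /* prints 3 */
	return 0;
}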
diff --git a/include/asm-x86_64/hw_irq.h b/include/asm-x86_64/hw_irq.h
index 2b5cb2865d21..d9212eb4e894 100644
--- a/include/asm-x86_64/hw_irq.h
+++ b/include/asm-x86_64/hw_irq.h
@@ -50,14 +50,15 @@ struct hw_interrupt_type;
  */
 #define SPURIOUS_APIC_VECTOR	0xff
 #define ERROR_APIC_VECTOR	0xfe
-#define INVALIDATE_TLB_VECTOR	0xfd
-#define RESCHEDULE_VECTOR	0xfc
-#define TASK_MIGRATION_VECTOR	0xfb
-#define CALL_FUNCTION_VECTOR	0xfa
-#define KDB_VECTOR	0xf9
-
-#define THERMAL_APIC_VECTOR	0xf0
+#define RESCHEDULE_VECTOR	0xfd
+#define CALL_FUNCTION_VECTOR	0xfc
+#define KDB_VECTOR	0xfb	/* reserved for KDB */
+#define THERMAL_APIC_VECTOR	0xfa
+/* 0xf9 free */
+#define INVALIDATE_TLB_VECTOR_END	0xf8
+#define INVALIDATE_TLB_VECTOR_START	0xf0	/* f0-f8 used for TLB flush */
 
+#define NUM_INVALIDATE_TLB_VECTORS	8
 
 /*
  * Local APIC timer IRQ vector is on a different priority level,