author     Tejun Heo <tj@kernel.org>    2009-01-21 03:26:06 -0500
committer  Tejun Heo <tj@kernel.org>    2009-01-21 03:26:06 -0500
commit     02cf94c370e0dc9bf408fe45eb86fe9ad58eaf7f (patch)
tree       e12620b5fead5ec6d90b54046a5025c2b28234a0 /arch/x86/kernel
parent     6dd01bedee6c3191643db303a1dc530bad56ec55 (diff)
x86: make x86_32 use tlb_64.c
Impact: less contention when issuing invalidate IPI, cleanup
Make x86_32 use the same tlb code as 64bit. The 64bit code uses
multiple IPI vectors for tlb shootdown to reduce contention. This
patch makes x86_32 allocate the same 8 IPIs as x86_64 and share the
code paths.
Note that the usage of asmlinkage is inconsistent for x86_32 and 64
and calls for further cleanup. This has been noted with a FIXME
comment in tlb_64.c.
Signed-off-by: Tejun Heo <tj@kernel.org>
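For context, the contention reduction comes from giving each sending CPU its own flush slot and IPI vector instead of the single global flush_mm/flush_va/tlbstate_lock trio that tlb_32.c (deleted below) used. The following is a minimal C sketch of that sender path, modeled on the tlb_64.c of this era; it is an illustration, not the verbatim kernel function — the name flush_tlb_others_sketch and the omission of CPU-hotplug and lazy-TLB handling are simplifications made here.

#include <linux/spinlock.h>
#include <linux/cpumask.h>
#include <linux/mm_types.h>
#include <mach_ipi.h>		/* send_IPI_mask(), as in the deleted tlb_32.c */

#define NUM_INVALIDATE_TLB_VECTORS	8

union smp_flush_state {
	struct {
		struct mm_struct *flush_mm;
		unsigned long flush_va;
		spinlock_t tlbstate_lock;
		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
	};
	char pad[SMP_CACHE_BYTES];	/* keep slots on separate cache lines */
} ____cacheline_aligned;

static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];

static void flush_tlb_others_sketch(const struct cpumask *cpumask,
				    struct mm_struct *mm, unsigned long va)
{
	/* Hash the sender onto one of the eight slot/vector pairs. */
	int sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
	union smp_flush_state *f = &flush_state[sender];

	/* Only senders that picked the same slot contend on this lock. */
	spin_lock(&f->tlbstate_lock);
	f->flush_mm = mm;
	f->flush_va = va;
	cpumask_andnot(to_cpumask(f->flush_cpumask),
		       cpumask, cpumask_of(smp_processor_id()));

	/* Receivers enter through the matching invalidate_interruptN stub. */
	send_IPI_mask(to_cpumask(f->flush_cpumask),
		      INVALIDATE_TLB_VECTOR_START + sender);

	/* Wait until every receiver has cleared its bit. */
	while (!cpumask_empty(to_cpumask(f->flush_cpumask)))
		cpu_relax();

	f->flush_mm = NULL;
	f->flush_va = 0;
	spin_unlock(&f->tlbstate_lock);
}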
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile     |   2
-rw-r--r--  arch/x86/kernel/entry_32.S   |   6
-rw-r--r--  arch/x86/kernel/irqinit_32.c |  11
-rw-r--r--  arch/x86/kernel/tlb_32.c     | 239
-rw-r--r--  arch/x86/kernel/tlb_64.c     |  12
5 files changed, 25 insertions(+), 245 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index eb074530c7d3..a62a15c22227 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -58,7 +58,7 @@ obj-$(CONFIG_PCI) += early-quirks.o
 apm-y				:= apm_32.o
 obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_X86_SMP)		+= smp.o
-obj-$(CONFIG_X86_SMP)		+= smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
+obj-$(CONFIG_X86_SMP)		+= smpboot.o tsc_sync.o ipi.o tlb_64.o
 obj-$(CONFIG_X86_32_SMP)	+= smpcommon.o
 obj-$(CONFIG_X86_64_SMP)	+= tsc_sync.o smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline_$(BITS).o
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 46469029e9d3..a0b91aac72a1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -672,7 +672,7 @@ common_interrupt:
 ENDPROC(common_interrupt)
 	CFI_ENDPROC
 
-#define BUILD_INTERRUPT(name, nr)	\
+#define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
 	RING0_INT_FRAME;		\
 	pushl $~(nr);			\
@@ -680,11 +680,13 @@ ENTRY(name)				\
 	SAVE_ALL;			\
 	TRACE_IRQS_OFF			\
 	movl %esp,%eax;			\
-	call smp_##name;		\
+	call fn;			\
 	jmp ret_from_intr;		\
 	CFI_ENDPROC;			\
 ENDPROC(name)
 
+#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name)
+
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 1507ad4e674d..bf629cadec1a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -149,8 +149,15 @@ void __init native_init_IRQ(void)
 	 */
 	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 
-	/* IPI for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+	/* IPIs for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
 
 	/* IPI for generic function call */
 	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index 93fcb05c7d43..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,239 +0,0 @@
-#include <linux/spinlock.h>
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-
-#include <asm/tlbflush.h>
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
-			= { &init_mm, 0, };
-
-/* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
-
-/*
- *	Smarter SMP flushing macros.
- *		c/o Linus Torvalds.
- *
- *	These mean you can really definitely utterly forget about
- *	writing to user space from interrupts. (Its not allowed anyway).
- *
- *	Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_var_t flush_cpumask;
-static struct mm_struct *flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(int cpu)
-{
-	BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
-	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
-	load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- *	Stop ipi delivery for the old mm. This is not synchronized with
- *	the other cpus, but smp_invalidate_interrupt ignore flush ipis
- *	for the wrong mm, and in the worst case we perform a superfluous
- *	tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- *	Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- *	was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- *	Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- *	Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- *	cpu_tlbstate[].active_mm is correct, cpu0 already handles
- *	flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- *	Atomically set the bit [other cpus will start sending flush ipis],
- *	and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- *   runs in kernel space, the cpu could load tlb entries for user space
- *   pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- *
- * Interrupts are disabled.
- */
-
-void smp_invalidate_interrupt(struct pt_regs *regs)
-{
-	unsigned int cpu;
-
-	cpu = smp_processor_id();
-
-	if (!cpumask_test_cpu(cpu, flush_cpumask))
-		goto out;
-		/*
-		 * This was a BUG() but until someone can quote me the
-		 * line from the intel manual that guarantees an IPI to
-		 * multiple CPUs is retried _only_ on the erroring CPUs
-		 * its staying as a return
-		 *
-		 * BUG();
-		 */
-
-	if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
-		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-			if (flush_va == TLB_FLUSH_ALL)
-				local_flush_tlb();
-			else
-				__flush_tlb_one(flush_va);
-		} else
-			leave_mm(cpu);
-	}
-out:
-	ack_APIC_irq();
-	smp_mb__before_clear_bit();
-	cpumask_clear_cpu(cpu, flush_cpumask);
-	smp_mb__after_clear_bit();
-	inc_irq_stat(irq_tlb_count);
-}
-
-void native_flush_tlb_others(const struct cpumask *cpumask,
-			     struct mm_struct *mm, unsigned long va)
-{
-	/*
-	 * - mask must exist :)
-	 */
-	BUG_ON(cpumask_empty(cpumask));
-	BUG_ON(!mm);
-
-	/*
-	 * i'm not happy about this global shared spinlock in the
-	 * MM hot path, but we'll see how contended it is.
-	 * AK: x86-64 has a faster method that could be ported.
-	 */
-	spin_lock(&tlbstate_lock);
-
-	cpumask_andnot(flush_cpumask, cpumask, cpumask_of(smp_processor_id()));
-#ifdef CONFIG_HOTPLUG_CPU
-	/* If a CPU which we ran on has gone down, OK. */
-	cpumask_and(flush_cpumask, flush_cpumask, cpu_online_mask);
-	if (unlikely(cpumask_empty(flush_cpumask))) {
-		spin_unlock(&tlbstate_lock);
-		return;
-	}
-#endif
-	flush_mm = mm;
-	flush_va = va;
-
-	/*
-	 * Make the above memory operations globally visible before
-	 * sending the IPI.
-	 */
-	smp_mb();
-	/*
-	 * We have to send the IPI only to
-	 * CPUs affected.
-	 */
-	send_IPI_mask(flush_cpumask, INVALIDATE_TLB_VECTOR);
-
-	while (!cpumask_empty(flush_cpumask))
-		/* nothing. lockup detection does not belong here */
-		cpu_relax();
-
-	flush_mm = NULL;
-	flush_va = 0;
-	spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
-	struct mm_struct *mm = current->mm;
-
-	preempt_disable();
-
-	local_flush_tlb();
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
-	preempt_enable();
-}
-
-void flush_tlb_mm(struct mm_struct *mm)
-{
-
-	preempt_disable();
-
-	if (current->active_mm == mm) {
-		if (current->mm)
-			local_flush_tlb();
-		else
-			leave_mm(smp_processor_id());
-	}
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL);
-
-	preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	preempt_disable();
-
-	if (current->active_mm == mm) {
-		if (current->mm)
-			__flush_tlb_one(va);
-		else
-			leave_mm(smp_processor_id());
-	}
-
-	if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids)
-		flush_tlb_others(&mm->cpu_vm_mask, mm, va);
-	preempt_enable();
-}
-
-static void do_flush_tlb_all(void *info)
-{
-	unsigned long cpu = smp_processor_id();
-
-	__flush_tlb_all();
-	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
-		leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
-	on_each_cpu(do_flush_tlb_all, NULL, 1);
-}
-
-static int init_flush_cpumask(void)
-{
-	alloc_cpumask_var(&flush_cpumask, GFP_KERNEL);
-	return 0;
-}
-early_initcall(init_flush_cpumask);
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
index 19ac661422f7..b3ca1b940654 100644
--- a/arch/x86/kernel/tlb_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -113,7 +113,17 @@ EXPORT_SYMBOL_GPL(leave_mm)
  * Interrupts are disabled.
  */
 
-asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
+/*
+ * FIXME: use of asmlinkage is not consistent.  On x86_64 it's noop
+ * but still used for documentation purpose but the usage is slightly
+ * inconsistent.  On x86_32, asmlinkage is regparm(0) but interrupt
+ * entry calls in with the first parameter in %eax.  Maybe define
+ * intrlinkage?
+ */
+#ifdef CONFIG_X86_64
+asmlinkage
+#endif
+void smp_invalidate_interrupt(struct pt_regs *regs)
 {
 	unsigned int cpu;
 	unsigned int sender;
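On the receive side, the handler has to work out which sender slot to look at. The BUILD_INTERRUPT3 stubs above push the negated vector number (pushl $~(nr)), which lands in orig_ax, so the handler can map the interrupt back to the slot the sender used. The following is a hedged sketch of that recovery, reusing the flush_state[] layout from the earlier sender sketch; smp_invalidate_interrupt_sketch and the trimmed lazy-TLB handling are illustrative simplifications, not the exact tlb_64.c function.

void smp_invalidate_interrupt_sketch(struct pt_regs *regs)
{
	unsigned int cpu = smp_processor_id();
	/* The entry stub pushed ~vector; undo that to find the sender's slot. */
	unsigned int sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
	union smp_flush_state *f = &flush_state[sender];

	if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm) &&
	    percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
		if (f->flush_va == TLB_FLUSH_ALL)
			local_flush_tlb();
		else
			__flush_tlb_one(f->flush_va);
	}

	ack_APIC_irq();
	/* Clearing our bit is what releases the spinning sender. */
	cpumask_clear_cpu(cpu, to_cpumask(f->flush_cpumask));
	inc_irq_stat(irq_tlb_count);
}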