aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
authorAlex Shi <alex.shi@intel.com>2012-06-27 21:02:17 -0400
committerH. Peter Anvin <hpa@zytor.com>2012-06-27 22:29:07 -0400
commite7b52ffd45a6d834473f43b349e7d86593d763c7 (patch)
tree12a930bdf1c43608e932f422505bf228afaf9880 /arch/x86/mm
parente0ba94f14f747c2661c4d21f8c44e5b0b8cd8e48 (diff)
x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range
x86 has no flush_tlb_range support at the instruction level. Currently flush_tlb_range is just implemented by flushing the whole page table. That is not the best solution for all scenarios. In fact, if we just use 'invlpg' to flush a few lines from the TLB, we can get a performance gain from accessing the remaining TLB lines later. But the 'invlpg' instruction costs much time. Its execution time can compete with cr3 rewriting, and even a bit more on SNB CPU. So, on a 512 4KB TLB entries CPU, the balance point is at: (512 - X) * 100ns(assumed TLB refill cost) = X(TLB flush entries) * 100ns(assumed invlpg cost) Here, X is 256, that is 1/2 of 512 entries. But with the mysterious CPU pre-fetcher and page miss handler unit, the assumed TLB refill cost is far lower than 100ns in sequential access. And 2 HT siblings in one core make memory access faster if they are accessing the same memory. So, in the patch, I just do the change when the number of target entries is less than 1/16 of the whole active tlb entries. Actually, I have no data to support the percentage '1/16', so any suggestions are welcomed. As to hugetlb, I guess that due to the smaller page table, and fewer active TLB entries, I didn't see a benefit via my benchmark, so no optimizing for now. My micro benchmark shows that in ideal scenarios, the performance improves 70 percent in reading. And in the worst scenario, the reading/writing performance is similar to the unpatched 3.4-rc4 kernel. 
Here is the reading data on my 2P * 4cores *HT NHM EP machine, with THP 'always': multi thread testing, '-t' parameter is thread number: with patch unpatched 3.4-rc4 ./mprotect -t 1 14ns 24ns ./mprotect -t 2 13ns 22ns ./mprotect -t 4 12ns 19ns ./mprotect -t 8 14ns 16ns ./mprotect -t 16 28ns 26ns ./mprotect -t 32 54ns 51ns ./mprotect -t 128 200ns 199ns Single process with sequential flushing and memory accessing: with patch unpatched 3.4-rc4 ./mprotect 7ns 11ns ./mprotect -p 4096 -l 8 -n 10240 21ns 21ns [ hpa: http://lkml.kernel.org/r/1B4B44D9196EFF41AE41FDA404FC0A100BFF94@SHSMSX101.ccr.corp.intel.com has additional performance numbers. ] Signed-off-by: Alex Shi <alex.shi@intel.com> Link: http://lkml.kernel.org/r/1340845344-27557-3-git-send-email-alex.shi@intel.com Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/tlb.c97
1 files changed, 81 insertions, 16 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5e57e113b72c..3b91c981a27f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -41,7 +41,8 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
41union smp_flush_state { 41union smp_flush_state {
42 struct { 42 struct {
43 struct mm_struct *flush_mm; 43 struct mm_struct *flush_mm;
44 unsigned long flush_va; 44 unsigned long flush_start;
45 unsigned long flush_end;
45 raw_spinlock_t tlbstate_lock; 46 raw_spinlock_t tlbstate_lock;
46 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 47 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
47 }; 48 };
@@ -156,10 +157,19 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
156 157
157 if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) { 158 if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
158 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 159 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
159 if (f->flush_va == TLB_FLUSH_ALL) 160 if (f->flush_end == TLB_FLUSH_ALL
161 || !cpu_has_invlpg)
160 local_flush_tlb(); 162 local_flush_tlb();
161 else 163 else if (!f->flush_end)
162 __flush_tlb_one(f->flush_va); 164 __flush_tlb_single(f->flush_start);
165 else {
166 unsigned long addr;
167 addr = f->flush_start;
168 while (addr < f->flush_end) {
169 __flush_tlb_single(addr);
170 addr += PAGE_SIZE;
171 }
172 }
163 } else 173 } else
164 leave_mm(cpu); 174 leave_mm(cpu);
165 } 175 }
@@ -172,7 +182,8 @@ out:
172} 182}
173 183
174static void flush_tlb_others_ipi(const struct cpumask *cpumask, 184static void flush_tlb_others_ipi(const struct cpumask *cpumask,
175 struct mm_struct *mm, unsigned long va) 185 struct mm_struct *mm, unsigned long start,
186 unsigned long end)
176{ 187{
177 unsigned int sender; 188 unsigned int sender;
178 union smp_flush_state *f; 189 union smp_flush_state *f;
@@ -185,7 +196,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
185 raw_spin_lock(&f->tlbstate_lock); 196 raw_spin_lock(&f->tlbstate_lock);
186 197
187 f->flush_mm = mm; 198 f->flush_mm = mm;
188 f->flush_va = va; 199 f->flush_start = start;
200 f->flush_end = end;
189 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { 201 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
190 /* 202 /*
191 * We have to send the IPI only to 203 * We have to send the IPI only to
@@ -199,24 +211,26 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 } 211 }
200 212
201 f->flush_mm = NULL; 213 f->flush_mm = NULL;
202 f->flush_va = 0; 214 f->flush_start = 0;
215 f->flush_end = 0;
203 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) 216 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
204 raw_spin_unlock(&f->tlbstate_lock); 217 raw_spin_unlock(&f->tlbstate_lock);
205} 218}
206 219
207void native_flush_tlb_others(const struct cpumask *cpumask, 220void native_flush_tlb_others(const struct cpumask *cpumask,
208 struct mm_struct *mm, unsigned long va) 221 struct mm_struct *mm, unsigned long start,
222 unsigned long end)
209{ 223{
210 if (is_uv_system()) { 224 if (is_uv_system()) {
211 unsigned int cpu; 225 unsigned int cpu;
212 226
213 cpu = smp_processor_id(); 227 cpu = smp_processor_id();
214 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 228 cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
215 if (cpumask) 229 if (cpumask)
216 flush_tlb_others_ipi(cpumask, mm, va); 230 flush_tlb_others_ipi(cpumask, mm, start, end);
217 return; 231 return;
218 } 232 }
219 flush_tlb_others_ipi(cpumask, mm, va); 233 flush_tlb_others_ipi(cpumask, mm, start, end);
220} 234}
221 235
222static void __cpuinit calculate_tlb_offset(void) 236static void __cpuinit calculate_tlb_offset(void)
@@ -282,7 +296,7 @@ void flush_tlb_current_task(void)
282 296
283 local_flush_tlb(); 297 local_flush_tlb();
284 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 298 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
285 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); 299 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
286 preempt_enable(); 300 preempt_enable();
287} 301}
288 302
@@ -297,12 +311,63 @@ void flush_tlb_mm(struct mm_struct *mm)
297 leave_mm(smp_processor_id()); 311 leave_mm(smp_processor_id());
298 } 312 }
299 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 313 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
300 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); 314 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
315
316 preempt_enable();
317}
318
319#define FLUSHALL_BAR 16
320
321void flush_tlb_range(struct vm_area_struct *vma,
322 unsigned long start, unsigned long end)
323{
324 struct mm_struct *mm;
325
326 if (!cpu_has_invlpg || vma->vm_flags & VM_HUGETLB) {
327 flush_tlb_mm(vma->vm_mm);
328 return;
329 }
330
331 preempt_disable();
332 mm = vma->vm_mm;
333 if (current->active_mm == mm) {
334 if (current->mm) {
335 unsigned long addr, vmflag = vma->vm_flags;
336 unsigned act_entries, tlb_entries = 0;
337
338 if (vmflag & VM_EXEC)
339 tlb_entries = tlb_lli_4k[ENTRIES];
340 else
341 tlb_entries = tlb_lld_4k[ENTRIES];
342
343 act_entries = tlb_entries > mm->total_vm ?
344 mm->total_vm : tlb_entries;
301 345
346 if ((end - start)/PAGE_SIZE > act_entries/FLUSHALL_BAR)
347 local_flush_tlb();
348 else {
349 for (addr = start; addr < end;
350 addr += PAGE_SIZE)
351 __flush_tlb_single(addr);
352
353 if (cpumask_any_but(mm_cpumask(mm),
354 smp_processor_id()) < nr_cpu_ids)
355 flush_tlb_others(mm_cpumask(mm), mm,
356 start, end);
357 preempt_enable();
358 return;
359 }
360 } else {
361 leave_mm(smp_processor_id());
362 }
363 }
364 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
365 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
302 preempt_enable(); 366 preempt_enable();
303} 367}
304 368
305void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 369
370void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
306{ 371{
307 struct mm_struct *mm = vma->vm_mm; 372 struct mm_struct *mm = vma->vm_mm;
308 373
@@ -310,13 +375,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
310 375
311 if (current->active_mm == mm) { 376 if (current->active_mm == mm) {
312 if (current->mm) 377 if (current->mm)
313 __flush_tlb_one(va); 378 __flush_tlb_one(start);
314 else 379 else
315 leave_mm(smp_processor_id()); 380 leave_mm(smp_processor_id());
316 } 381 }
317 382
318 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 383 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
319 flush_tlb_others(mm_cpumask(mm), mm, va); 384 flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
320 385
321 preempt_enable(); 386 preempt_enable();
322} 387}