aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/xen/mmu.c
diff options
context:
space:
mode:
authorAlex Shi <alex.shi@intel.com>2012-06-27 21:02:17 -0400
committerH. Peter Anvin <hpa@zytor.com>2012-06-27 22:29:07 -0400
commite7b52ffd45a6d834473f43b349e7d86593d763c7 (patch)
tree12a930bdf1c43608e932f422505bf228afaf9880 /arch/x86/xen/mmu.c
parente0ba94f14f747c2661c4d21f8c44e5b0b8cd8e48 (diff)
x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range
x86 has no flush_tlb_range support at the instruction level. Currently flush_tlb_range is just implemented by flushing all page table entries from the TLB. That is not the best solution for all scenarios. In fact, if we just use 'invlpg' to flush a few entries from the TLB, we gain performance from later accesses that hit the remaining TLB entries. But the 'invlpg' instruction takes a lot of time. Its execution time is comparable to rewriting cr3, and even a bit more on SNB CPUs. So, on a CPU with 512 4KB TLB entries, the balance point is at: (512 - X) * 100ns (assumed TLB refill cost) = X (TLB entries flushed) * 100ns (assumed invlpg cost). Here, X is 256, that is, 1/2 of the 512 entries. But with the mysterious CPU pre-fetcher and page miss handler unit, the assumed TLB refill cost is far lower than 100ns in sequential access. And 2 HT siblings in one core make memory access faster if they are accessing the same memory. So, in the patch, I only make the change when the number of target entries is less than 1/16 of all active TLB entries. Actually, I have no data to support the fraction '1/16', so any suggestions are welcome. As to hugetlb, I guess that due to the smaller page table and fewer active TLB entries, I didn't see a benefit in my benchmark, so there is no optimization for it now. My micro benchmark shows that in ideal scenarios, read performance improves by 70 percent. And in the worst scenario, the reading/writing performance is similar to that of the unpatched 3.4-rc4 kernel.
Here is the reading data on my 2P * 4 cores * HT NHM EP machine, with THP 'always':

Multi-thread testing, the '-t' parameter is the thread number:

                        with patch   unpatched 3.4-rc4
./mprotect -t 1           14ns         24ns
./mprotect -t 2           13ns         22ns
./mprotect -t 4           12ns         19ns
./mprotect -t 8           14ns         16ns
./mprotect -t 16          28ns         26ns
./mprotect -t 32          54ns         51ns
./mprotect -t 128         200ns        199ns

Single process with sequential flushing and memory accessing:

                                      with patch   unpatched 3.4-rc4
./mprotect                              7ns          11ns
./mprotect -p 4096 -l 8 -n 10240        21ns         21ns

[ hpa: http://lkml.kernel.org/r/1B4B44D9196EFF41AE41FDA404FC0A100BFF94@SHSMSX101.ccr.corp.intel.com has additional performance numbers. ]

Signed-off-by: Alex Shi <alex.shi@intel.com>
Link: http://lkml.kernel.org/r/1340845344-27557-3-git-send-email-alex.shi@intel.com
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r--arch/x86/xen/mmu.c12
1 files changed, 6 insertions, 6 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3a73785631ce..39ed56789f68 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1244,7 +1244,8 @@ static void xen_flush_tlb_single(unsigned long addr)
1244} 1244}
1245 1245
1246static void xen_flush_tlb_others(const struct cpumask *cpus, 1246static void xen_flush_tlb_others(const struct cpumask *cpus,
1247 struct mm_struct *mm, unsigned long va) 1247 struct mm_struct *mm, unsigned long start,
1248 unsigned long end)
1248{ 1249{
1249 struct { 1250 struct {
1250 struct mmuext_op op; 1251 struct mmuext_op op;
@@ -1256,7 +1257,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1256 } *args; 1257 } *args;
1257 struct multicall_space mcs; 1258 struct multicall_space mcs;
1258 1259
1259 trace_xen_mmu_flush_tlb_others(cpus, mm, va); 1260 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1260 1261
1261 if (cpumask_empty(cpus)) 1262 if (cpumask_empty(cpus))
1262 return; /* nothing to do */ 1263 return; /* nothing to do */
@@ -1269,11 +1270,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1269 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 1270 cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1270 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1271 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1271 1272
1272 if (va == TLB_FLUSH_ALL) { 1273 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1273 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1274 if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1274 } else {
1275 args->op.cmd = MMUEXT_INVLPG_MULTI; 1275 args->op.cmd = MMUEXT_INVLPG_MULTI;
1276 args->op.arg1.linear_addr = va; 1276 args->op.arg1.linear_addr = start;
1277 } 1277 }
1278 1278
1279 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1279 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);