aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
authorAlex Shi <alex.shi@intel.com>2012-06-27 21:02:17 -0400
committerH. Peter Anvin <hpa@zytor.com>2012-06-27 22:29:07 -0400
commite7b52ffd45a6d834473f43b349e7d86593d763c7 (patch)
tree12a930bdf1c43608e932f422505bf228afaf9880 /arch/x86/mm
parente0ba94f14f747c2661c4d21f8c44e5b0b8cd8e48 (diff)
x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range
x86 has no flush_tlb_range support at the instruction level. Currently flush_tlb_range is just implemented by flushing the whole page table. That is not the best solution for all scenarios. In fact, if we just use 'invlpg' to flush a few lines from the TLB, we can get a performance gain from accessing the remaining TLB lines later. But the 'invlpg' instruction costs much time. Its execution time can compete with cr3 rewriting, and even a bit more on SNB CPU. So, on a 512 4KB TLB entries CPU, the balance point is at: (512 - X) * 100ns(assumed TLB refill cost) = X(TLB flush entries) * 100ns(assumed invlpg cost) Here, X is 256, that is 1/2 of 512 entries. But with the mysterious CPU pre-fetcher and page miss handler unit, the assumed TLB refill cost is far lower than 100ns in sequential access. And 2 HT siblings in one core make memory access faster if they are accessing the same memory. So, in the patch, I just do the change when the number of target entries is less than 1/16 of the whole active tlb entries. Actually, I have no data to support the percentage '1/16', so any suggestions are welcomed. As to hugetlb, I guess that due to the smaller page table, and fewer active TLB entries, I didn't see a benefit via my benchmark, so no optimizing for now. My micro benchmark shows that in ideal scenarios, the performance improves 70 percent in reading. And in the worst scenario, the reading/writing performance is similar to the unpatched 3.4-rc4 kernel. 
Here is the reading data on my 2P * 4cores *HT NHM EP machine, with THP 'always': multi thread testing, '-t' parameter is thread number: with patch unpatched 3.4-rc4 ./mprotect -t 1 14ns 24ns ./mprotect -t 2 13ns 22ns ./mprotect -t 4 12ns 19ns ./mprotect -t 8 14ns 16ns ./mprotect -t 16 28ns 26ns ./mprotect -t 32 54ns 51ns ./mprotect -t 128 200ns 199ns Single process with sequential flushing and memory accessing: with patch unpatched 3.4-rc4 ./mprotect 7ns 11ns ./mprotect -p 4096 -l 8 -n 10240 21ns 21ns [ hpa: http://lkml.kernel.org/r/1B4B44D9196EFF41AE41FDA404FC0A100BFF94@SHSMSX101.ccr.corp.intel.com has additional performance numbers. ] Signed-off-by: Alex Shi <alex.shi@intel.com> Link: http://lkml.kernel.org/r/1340845344-27557-3-git-send-email-alex.shi@intel.com Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/tlb.c97
1 files changed, 81 insertions, 16 deletions
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5e57e113b72c..3b91c981a27f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -41,7 +41,8 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
41union smp_flush_state { 41union smp_flush_state {
42 struct { 42 struct {
43 struct mm_struct *flush_mm; 43 struct mm_struct *flush_mm;
44 unsigned long flush_va; 44 unsigned long flush_start;
45 unsigned long flush_end;
45 raw_spinlock_t tlbstate_lock; 46 raw_spinlock_t tlbstate_lock;
46 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 47 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
47 }; 48 };
@@ -156,10 +157,19 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
156 157
157 if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) { 158 if (f->flush_mm == this_cpu_read(cpu_tlbstate.active_mm)) {
158 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 159 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
159 if (f->flush_va == TLB_FLUSH_ALL) 160 if (f->flush_end == TLB_FLUSH_ALL
161 || !cpu_has_invlpg)
160 local_flush_tlb(); 162 local_flush_tlb();
161 else 163 else if (!f->flush_end)
162 __flush_tlb_one(f->flush_va); 164 __flush_tlb_single(f->flush_start);
165 else {
166 unsigned long addr;
167 addr = f->flush_start;
168 while (addr < f->flush_end) {
169 __flush_tlb_single(addr);
170 addr += PAGE_SIZE;
171 }
172 }
163 } else 173 } else
164 leave_mm(cpu); 174 leave_mm(cpu);
165 } 175 }
@@ -172,7 +182,8 @@ out:
172} 182}
173 183
174static void flush_tlb_others_ipi(const struct cpumask *cpumask, 184static void flush_tlb_others_ipi(const struct cpumask *cpumask,
175 struct mm_struct *mm, unsigned long va) 185 struct mm_struct *mm, unsigned long start,
186 unsigned long end)
176{ 187{
177 unsigned int sender; 188 unsigned int sender;
178 union smp_flush_state *f; 189 union smp_flush_state *f;
@@ -185,7 +196,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
185 raw_spin_lock(&f->tlbstate_lock); 196 raw_spin_lock(&f->tlbstate_lock);
186 197
187 f->flush_mm = mm; 198 f->flush_mm = mm;
188 f->flush_va = va; 199 f->flush_start = start;
200 f->flush_end = end;
189 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) { 201 if (cpumask_andnot(to_cpumask(f->flush_cpumask), cpumask, cpumask_of(smp_processor_id()))) {
190 /* 202 /*
191 * We have to send the IPI only to 203 * We have to send the IPI only to
@@ -199,24 +211,26 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 } 211 }
200 212
201 f->flush_mm = NULL; 213 f->flush_mm = NULL;
202 f->flush_va = 0; 214 f->flush_start = 0;
215 f->flush_end = 0;
203 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) 216 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
204 raw_spin_unlock(&f->tlbstate_lock); 217 raw_spin_unlock(&f->tlbstate_lock);
205} 218}
206 219
207void native_flush_tlb_others(const struct cpumask *cpumask, 220void native_flush_tlb_others(const struct cpumask *cpumask,
208 struct mm_struct *mm, unsigned long va) 221 struct mm_struct *mm, unsigned long start,
222 unsigned long end)
209{ 223{
210 if (is_uv_system()) { 224 if (is_uv_system()) {
211 unsigned int cpu; 225 unsigned int cpu;
212 226
213 cpu = smp_processor_id(); 227 cpu = smp_processor_id();
214 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 228 cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
215 if (cpumask) 229 if (cpumask)
216 flush_tlb_others_ipi(cpumask, mm, va); 230 flush_tlb_others_ipi(cpumask, mm, start, end);
217 return; 231 return;
218 } 232 }
219 flush_tlb_others_ipi(cpumask, mm, va); 233 flush_tlb_others_ipi(cpumask, mm, start, end);
220} 234}
221 235
222static void __cpuinit calculate_tlb_offset(void) 236static void __cpuinit calculate_tlb_offset(void)
@@ -282,7 +296,7 @@ void flush_tlb_current_task(void)
282 296
283 local_flush_tlb(); 297 local_flush_tlb();
284 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 298 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
285 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); 299 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
286 preempt_enable(); 300 preempt_enable();
287} 301}
288 302
@@ -297,12 +311,63 @@ void flush_tlb_mm(struct mm_struct *mm)
297 leave_mm(smp_processor_id()); 311 leave_mm(smp_processor_id());
298 } 312 }
299 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 313 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
300 flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); 314 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
315
316 preempt_enable();
317}
318
319#define FLUSHALL_BAR 16
320
321void flush_tlb_range(struct vm_area_struct *vma,
322 unsigned long start, unsigned long end)
323{
324 struct mm_struct *mm;
325
326 if (!cpu_has_invlpg || vma->vm_flags & VM_HUGETLB) {
327 flush_tlb_mm(vma->vm_mm);
328 return;
329 }
330
331 preempt_disable();
332 mm = vma->vm_mm;
333 if (current->active_mm == mm) {
334 if (current->mm) {
335 unsigned long addr, vmflag = vma->vm_flags;
336 unsigned act_entries, tlb_entries = 0;
337
338 if (vmflag & VM_EXEC)
339 tlb_entries = tlb_lli_4k[ENTRIES];
340 else
341 tlb_entries = tlb_lld_4k[ENTRIES];
342
343 act_entries = tlb_entries > mm->total_vm ?
344 mm->total_vm : tlb_entries;
301 345
346 if ((end - start)/PAGE_SIZE > act_entries/FLUSHALL_BAR)
347 local_flush_tlb();
348 else {
349 for (addr = start; addr < end;
350 addr += PAGE_SIZE)
351 __flush_tlb_single(addr);
352
353 if (cpumask_any_but(mm_cpumask(mm),
354 smp_processor_id()) < nr_cpu_ids)
355 flush_tlb_others(mm_cpumask(mm), mm,
356 start, end);
357 preempt_enable();
358 return;
359 }
360 } else {
361 leave_mm(smp_processor_id());
362 }
363 }
364 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
365 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
302 preempt_enable(); 366 preempt_enable();
303} 367}
304 368
305void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) 369
370void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
306{ 371{
307 struct mm_struct *mm = vma->vm_mm; 372 struct mm_struct *mm = vma->vm_mm;
308 373
@@ -310,13 +375,13 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
310 375
311 if (current->active_mm == mm) { 376 if (current->active_mm == mm) {
312 if (current->mm) 377 if (current->mm)
313 __flush_tlb_one(va); 378 __flush_tlb_one(start);
314 else 379 else
315 leave_mm(smp_processor_id()); 380 leave_mm(smp_processor_id());
316 } 381 }
317 382
318 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 383 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
319 flush_tlb_others(mm_cpumask(mm), mm, va); 384 flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
320 385
321 preempt_enable(); 386 preempt_enable();
322} 387}