author		Dave Hansen <dave.hansen@linux.intel.com>	2014-07-31 11:40:55 -0400
committer	H. Peter Anvin <hpa@linux.intel.com>	2014-07-31 11:48:50 -0400
commit		e9f4e0a9fe2723078b7a1a1169828dd46a7b2f9e (patch)
tree		c16300d2f05f2fce6b7b70b2c6fed1ac58486129 /arch/x86/mm/tlb.c
parent		4995ab9cf512e9a6cc07dfd6b1d4e2fc48ce7fef (diff)
x86/mm: Rip out complicated, out-of-date, buggy TLB flushing
I think the flush_tlb_mm_range() code that tries to tune the flush sizes based on the CPU needs to get ripped out for several reasons:

1. It is obviously buggy. It uses mm->total_vm to judge the task's footprint in the TLB. It should certainly be using some measure of RSS, *NOT* ->total_vm, since only resident memory can populate the TLB.

2. Haswell and several other CPUs are missing from the intel_tlb_flushall_shift_set() function. Thus, it has been demonstrated to bitrot quickly in practice.

3. It is plain wrong in my VM:

	[    0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
	[    0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0
	[    0.037444] tlb_flushall_shift: 6

   which leads it to never use invlpg.

4. The assumptions about TLB refill costs are wrong:
	http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@intel.com
   (more on this in later patches)

5. I cannot reproduce the original data: https://lkml.org/lkml/2012/5/17/59
   I believe the sample times were too short. Running the benchmark in a loop yields times that vary quite a bit.

Note that this leaves us with a static ceiling of 1 page. This is a conservative, dumb setting, and will be revised in a later patch.

This also removes the code which attempts to predict whether we are flushing data or instructions. We expect instruction flushes to be relatively rare and not worth tuning for explicitly.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154055.ABC88E89@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
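In effect, the flush-size decision collapses to one comparison against the new ceiling. A minimal sketch of the resulting policy (names taken from the patch below; local_flush_tlb() and __flush_tlb_single() are the existing x86 helpers):

	if ((end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
		/* range larger than the ceiling: flush the whole TLB */
		local_flush_tlb();
	} else {
		/* small range: one invlpg per 4K page */
		for (addr = start; addr < end; addr += PAGE_SIZE)
			__flush_tlb_single(addr);
	}

With the ceiling at 1 and 4K pages, only single-page ranges take the invlpg path; anything larger falls back to a full flush.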
Diffstat (limited to 'arch/x86/mm/tlb.c')
-rw-r--r--	arch/x86/mm/tlb.c	87
1 file changed, 11 insertions(+), 76 deletions(-)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 378fbef279d2..dff6ddebc45f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -158,13 +158,14 @@ void flush_tlb_current_task(void)
 	preempt_enable();
 }
 
+/* in units of pages */
+unsigned long tlb_single_page_flush_ceiling = 1;
+
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 				unsigned long end, unsigned long vmflag)
 {
-	bool need_flush_others_all = true;
+	int need_flush_others_all = 1;
 	unsigned long addr;
-	unsigned act_entries, tlb_entries = 0;
-	unsigned long nr_base_pages;
 
 	preempt_disable();
 	if (current->active_mm != mm)
@@ -175,29 +176,16 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 		goto out;
 	}
 
-	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
-					|| vmflag & VM_HUGETLB) {
+	if (end == TLB_FLUSH_ALL || vmflag & VM_HUGETLB) {
 		local_flush_tlb();
 		goto out;
 	}
 
-	/* In modern CPU, last level tlb used for both data/ins */
-	if (vmflag & VM_EXEC)
-		tlb_entries = tlb_lli_4k[ENTRIES];
-	else
-		tlb_entries = tlb_lld_4k[ENTRIES];
-
-	/* Assume all of TLB entries was occupied by this task */
-	act_entries = tlb_entries >> tlb_flushall_shift;
-	act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
-	nr_base_pages = (end - start) >> PAGE_SHIFT;
-
-	/* tlb_flushall_shift is on balance point, details in commit log */
-	if (nr_base_pages > act_entries) {
+	if ((end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
 		local_flush_tlb();
 	} else {
-		need_flush_others_all = false;
+		need_flush_others_all = 0;
 		/* flush range by one by one 'invlpg' */
 		for (addr = start; addr < end; addr += PAGE_SIZE) {
 			count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
@@ -259,68 +247,15 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	unsigned act_entries;
-	struct flush_tlb_info info;
-
-	/* In modern CPU, last level tlb used for both data/ins */
-	act_entries = tlb_lld_4k[ENTRIES];
 
 	/* Balance as user space task's flush, a bit conservative */
-	if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
-		(end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
-
+	if (end == TLB_FLUSH_ALL ||
+	    (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
 		on_each_cpu(do_flush_tlb_all, NULL, 1);
-	else {
+	} else {
+		struct flush_tlb_info info;
 		info.flush_start = start;
 		info.flush_end = end;
 		on_each_cpu(do_kernel_range_flush, &info, 1);
 	}
 }
-
-#ifdef CONFIG_DEBUG_TLBFLUSH
-static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
-				size_t count, loff_t *ppos)
-{
-	char buf[32];
-	unsigned int len;
-
-	len = sprintf(buf, "%hd\n", tlb_flushall_shift);
-	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
-}
-
-static ssize_t tlbflush_write_file(struct file *file,
-		 const char __user *user_buf, size_t count, loff_t *ppos)
-{
-	char buf[32];
-	ssize_t len;
-	s8 shift;
-
-	len = min(count, sizeof(buf) - 1);
-	if (copy_from_user(buf, user_buf, len))
-		return -EFAULT;
-
-	buf[len] = '\0';
-	if (kstrtos8(buf, 0, &shift))
-		return -EINVAL;
-
-	if (shift < -1 || shift >= BITS_PER_LONG)
-		return -EINVAL;
-
-	tlb_flushall_shift = shift;
-	return count;
-}
-
-static const struct file_operations fops_tlbflush = {
-	.read = tlbflush_read_file,
-	.write = tlbflush_write_file,
-	.llseek = default_llseek,
-};
-
-static int __init create_tlb_flushall_shift(void)
-{
-	debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
-			arch_debugfs_dir, NULL, &fops_tlbflush);
-	return 0;
-}
-late_initcall(create_tlb_flushall_shift);
-#endif