author    Dave Hansen <dave.hansen@linux.intel.com>  2014-07-31 11:40:55 -0400
committer H. Peter Anvin <hpa@linux.intel.com>       2014-07-31 11:48:50 -0400
commit    e9f4e0a9fe2723078b7a1a1169828dd46a7b2f9e
tree      c16300d2f05f2fce6b7b70b2c6fed1ac58486129
parent    4995ab9cf512e9a6cc07dfd6b1d4e2fc48ce7fef
x86/mm: Rip out complicated, out-of-date, buggy TLB flushing
I think the flush_tlb_mm_range() code that tries to tune the
flush sizes based on the CPU needs to get ripped out for
several reasons:

1. It is obviously buggy.  It uses mm->total_vm to judge the
   task's footprint in the TLB.  It should certainly be using
   some measure of RSS, *NOT* ->total_vm, since only resident
   memory can populate the TLB.
2. Haswell and several other CPUs are missing from the
   intel_tlb_flushall_shift_set() function.  Thus, it has been
   demonstrated to bitrot quickly in practice.
3. It is plain wrong in my vm:
        [    0.037444] Last level iTLB entries: 4KB 0, 2MB 0, 4MB 0
        [    0.037444] Last level dTLB entries: 4KB 0, 2MB 0, 4MB 0
        [    0.037444] tlb_flushall_shift: 6
   which leads it to never use invlpg.
4. The assumptions about TLB refill costs are wrong:
        http://lkml.kernel.org/r/1337782555-8088-3-git-send-email-alex.shi@intel.com
   (more on this in later patches)
5. I can not reproduce the original data:
        https://lkml.org/lkml/2012/5/17/59
   I believe the sample times were too short.  Running the
   benchmark in a loop yields times that vary quite a bit.

Note that this leaves us with a static ceiling of 1 page.  This
is a conservative, dumb setting, and will be revised in a later
patch.

This also removes the code which attempts to predict whether we
are flushing data or instructions.  We expect instruction flushes
to be relatively rare and not worth tuning for explicitly.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: http://lkml.kernel.org/r/20140731154055.ABC88E89@viggo.jf.intel.com
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
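[ Editor's illustration, not part of the patch: under the removed heuristic,
invlpg was chosen only when the number of pages flushed was at most
min(tlb_entries >> tlb_flushall_shift, mm->total_vm); with the default shift
of 6 and a hypothetical 512-entry dTLB that means ranges of at most
512 >> 6 = 8 pages, and with the 0-entry TLB reported in point 3 it means
never.  The new decision reduces to one comparison against the ceiling.  A
minimal standalone C model of that comparison follows; the names mirror the
kernel's, but the program itself is hypothetical: ]

        #include <stdio.h>

        #define PAGE_SIZE 4096UL

        /* Mirrors the tunable this patch introduces: ranges of at most
         * this many pages are flushed page-by-page with invlpg; anything
         * larger takes a full TLB flush (cr3 write). */
        static unsigned long tlb_single_page_flush_ceiling = 1;

        /* Return 1 when a full flush would be chosen, 0 for invlpg. */
        static int would_flush_all(unsigned long start, unsigned long end)
        {
                return (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE;
        }

        int main(void)
        {
                /* 1 page is within the ceiling: per-page invlpg. */
                printf("1 page:  %s\n",
                       would_flush_all(0, PAGE_SIZE) ? "flush all" : "invlpg");
                /* 2 pages exceed the ceiling of 1: full flush. */
                printf("2 pages: %s\n",
                       would_flush_all(0, 2 * PAGE_SIZE) ? "flush all" : "invlpg");
                return 0;
        }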
 arch/x86/include/asm/processor.h |  1 -
 arch/x86/kernel/cpu/amd.c        |  7 --
 arch/x86/kernel/cpu/common.c     | 13 +-
 arch/x86/kernel/cpu/intel.c      | 26 ---
 arch/x86/mm/tlb.c                | 87 ++---
 5 files changed, 13 insertions(+), 121 deletions(-)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a4ea02351f4d..43d61daea966 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO];
 extern u16 __read_mostly tlb_lld_2m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_1g[NR_INFO];
-extern s8 __read_mostly tlb_flushall_shift;
 
 /*
  * CPU type and hardware bug flags. Kept separately for each CPU.
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ce8b8ff0e0ef..a1a53d094987 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -741,11 +741,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 }
 #endif
 
-static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
-{
-        tlb_flushall_shift = 6;
-}
-
 static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
 {
         u32 ebx, eax, ecx, edx;
@@ -793,8 +788,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
         tlb_lli_2m[ENTRIES] = eax & mask;
 
         tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
-
-        cpu_set_tlb_flushall_shift(c);
 }
 
 static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2cbbf88d8f2c..2c1782085121 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -480,26 +480,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
 u16 __read_mostly tlb_lld_4m[NR_INFO];
 u16 __read_mostly tlb_lld_1g[NR_INFO];
 
-/*
- * tlb_flushall_shift shows the balance point in replacing cr3 write
- * with multiple 'invlpg'. It will do this replacement when
- *   flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
- * If tlb_flushall_shift is -1, means the replacement will be disabled.
- */
-s8 __read_mostly tlb_flushall_shift = -1;
-
 void cpu_detect_tlb(struct cpuinfo_x86 *c)
 {
         if (this_cpu->c_detect_tlb)
                 this_cpu->c_detect_tlb(c);
 
         printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
-                "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
-                "tlb_flushall_shift: %d\n",
+                "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
                 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
                 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
                 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
-                tlb_lld_1g[ENTRIES], tlb_flushall_shift);
+                tlb_lld_1g[ENTRIES]);
 }
 
 void detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index a80029035bf2..cd61755de49b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc)
         }
 }
 
-static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
-{
-        switch ((c->x86 << 8) + c->x86_model) {
-        case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
-        case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
-        case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
-        case 0x61d: /* six-core 45 nm xeon "Dunnington" */
-                tlb_flushall_shift = -1;
-                break;
-        case 0x63a: /* Ivybridge */
-                tlb_flushall_shift = 2;
-                break;
-        case 0x61a: /* 45 nm nehalem, "Bloomfield" */
-        case 0x61e: /* 45 nm nehalem, "Lynnfield" */
-        case 0x625: /* 32 nm nehalem, "Clarkdale" */
-        case 0x62c: /* 32 nm nehalem, "Gulftown" */
-        case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
-        case 0x62f: /* 32 nm Xeon E7 */
-        case 0x62a: /* SandyBridge */
-        case 0x62d: /* SandyBridge, "Romely-EP" */
-        default:
-                tlb_flushall_shift = 6;
-        }
-}
-
 static void intel_detect_tlb(struct cpuinfo_x86 *c)
 {
         int i, j, n;
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
         for (j = 1 ; j < 16 ; j++)
                 intel_tlb_lookup(desc[j]);
         }
-        intel_tlb_flushall_shift_set(c);
 }
 
 static const struct cpu_dev intel_cpu_dev = {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 378fbef279d2..dff6ddebc45f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -158,13 +158,14 @@ void flush_tlb_current_task(void)
         preempt_enable();
 }
 
+/* in units of pages */
+unsigned long tlb_single_page_flush_ceiling = 1;
+
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                 unsigned long end, unsigned long vmflag)
 {
-        bool need_flush_others_all = true;
+        int need_flush_others_all = 1;
         unsigned long addr;
-        unsigned act_entries, tlb_entries = 0;
-        unsigned long nr_base_pages;
 
         preempt_disable();
         if (current->active_mm != mm)
@@ -175,29 +176,16 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                 goto out;
         }
 
-        if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
-                                        || vmflag & VM_HUGETLB) {
+        if (end == TLB_FLUSH_ALL || vmflag & VM_HUGETLB) {
                 local_flush_tlb();
                 goto out;
         }
 
-        /* In modern CPU, last level tlb used for both data/ins */
-        if (vmflag & VM_EXEC)
-                tlb_entries = tlb_lli_4k[ENTRIES];
-        else
-                tlb_entries = tlb_lld_4k[ENTRIES];
-
-        /* Assume all of TLB entries was occupied by this task */
-        act_entries = tlb_entries >> tlb_flushall_shift;
-        act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
-        nr_base_pages = (end - start) >> PAGE_SHIFT;
-
-        /* tlb_flushall_shift is on balance point, details in commit log */
-        if (nr_base_pages > act_entries) {
+        if ((end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
                 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                 local_flush_tlb();
         } else {
-                need_flush_others_all = false;
+                need_flush_others_all = 0;
                 /* flush range by one by one 'invlpg' */
                 for (addr = start; addr < end; addr += PAGE_SIZE) {
                         count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
@@ -259,68 +247,15 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-        unsigned act_entries;
-        struct flush_tlb_info info;
-
-        /* In modern CPU, last level tlb used for both data/ins */
-        act_entries = tlb_lld_4k[ENTRIES];
 
         /* Balance as user space task's flush, a bit conservative */
-        if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
-                (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
-
+        if (end == TLB_FLUSH_ALL ||
+            (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
                 on_each_cpu(do_flush_tlb_all, NULL, 1);
-        else {
+        } else {
+                struct flush_tlb_info info;
                 info.flush_start = start;
                 info.flush_end = end;
                 on_each_cpu(do_kernel_range_flush, &info, 1);
         }
 }
-
-#ifdef CONFIG_DEBUG_TLBFLUSH
-static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
-                                size_t count, loff_t *ppos)
-{
-        char buf[32];
-        unsigned int len;
-
-        len = sprintf(buf, "%hd\n", tlb_flushall_shift);
-        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
-}
-
-static ssize_t tlbflush_write_file(struct file *file,
-                 const char __user *user_buf, size_t count, loff_t *ppos)
-{
-        char buf[32];
-        ssize_t len;
-        s8 shift;
-
-        len = min(count, sizeof(buf) - 1);
-        if (copy_from_user(buf, user_buf, len))
-                return -EFAULT;
-
-        buf[len] = '\0';
-        if (kstrtos8(buf, 0, &shift))
-                return -EINVAL;
-
-        if (shift < -1 || shift >= BITS_PER_LONG)
-                return -EINVAL;
-
-        tlb_flushall_shift = shift;
-        return count;
-}
-
-static const struct file_operations fops_tlbflush = {
-        .read = tlbflush_read_file,
-        .write = tlbflush_write_file,
-        .llseek = default_llseek,
-};
-
-static int __init create_tlb_flushall_shift(void)
-{
-        debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
-                            arch_debugfs_dir, NULL, &fops_tlbflush);
-        return 0;
-}
-late_initcall(create_tlb_flushall_shift);
-#endif
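
[ Editor's illustration, not part of the patch: the commit message says the
static ceiling of 1 page will be revised later.  One hypothetical shape for
re-exposing it as a tunable, modeled directly on the debugfs knob removed
above, with tlb_flushall_shift swapped for tlb_single_page_flush_ceiling: ]

        #ifdef CONFIG_DEBUG_TLBFLUSH
        static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
                                        size_t count, loff_t *ppos)
        {
                char buf[32];
                unsigned int len;

                /* report the current ceiling, in pages */
                len = sprintf(buf, "%lu\n", tlb_single_page_flush_ceiling);
                return simple_read_from_buffer(user_buf, count, ppos, buf, len);
        }

        static ssize_t tlbflush_write_file(struct file *file,
                         const char __user *user_buf, size_t count, loff_t *ppos)
        {
                char buf[32];
                ssize_t len;
                unsigned long ceiling;

                len = min(count, sizeof(buf) - 1);
                if (copy_from_user(buf, user_buf, len))
                        return -EFAULT;

                buf[len] = '\0';
                if (kstrtoul(buf, 0, &ceiling))
                        return -EINVAL;

                tlb_single_page_flush_ceiling = ceiling;
                return count;
        }

        static const struct file_operations fops_tlbflush = {
                .read = tlbflush_read_file,
                .write = tlbflush_write_file,
                .llseek = default_llseek,
        };

        static int __init create_tlb_single_page_flush_ceiling(void)
        {
                debugfs_create_file("tlb_single_page_flush_ceiling",
                                    S_IRUSR | S_IWUSR, arch_debugfs_dir,
                                    NULL, &fops_tlbflush);
                return 0;
        }
        late_initcall(create_tlb_single_page_flush_ceiling);
        #endif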