aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/x86/tlb.txt75
-rw-r--r--arch/x86/include/asm/mmu_context.h6
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/kernel/cpu/amd.c7
-rw-r--r--arch/x86/kernel/cpu/common.c13
-rw-r--r--arch/x86/kernel/cpu/intel.c26
-rw-r--r--arch/x86/mm/fault.c6
-rw-r--r--arch/x86/mm/init.c7
-rw-r--r--arch/x86/mm/tlb.c103
-rw-r--r--include/linux/mm_types.h8
-rw-r--r--include/trace/events/tlb.h40
11 files changed, 193 insertions, 99 deletions
diff --git a/Documentation/x86/tlb.txt b/Documentation/x86/tlb.txt
new file mode 100644
index 000000000000..2b3a82e69151
--- /dev/null
+++ b/Documentation/x86/tlb.txt
@@ -0,0 +1,75 @@
1When the kernel unmaps or modifies the attributes of a range of
2memory, it has two choices:
3 1. Flush the entire TLB with a two-instruction sequence. This is
4 a quick operation, but it causes collateral damage: TLB entries
5 from areas other than the one we are trying to flush will be
6 destroyed and must be refilled later, at some cost.
7 2. Use the invlpg instruction to invalidate a single page at a
8 time. This could potentially cost many more instructions, but
9 it is a much more precise operation, causing no collateral
10 damage to other TLB entries.
11
12Which method to use depends on a few things:
13 1. The size of the flush being performed. A flush of the entire
14 address space is obviously better performed by flushing the
15 entire TLB than doing 2^48/PAGE_SIZE individual flushes.
16 2. The contents of the TLB. If the TLB is empty, then there will
17 be no collateral damage caused by doing the global flush, and
18 all of the individual flushes will have ended up being wasted
19 work.
20 3. The size of the TLB. The larger the TLB, the more collateral
21 damage we do with a full flush. So, the larger the TLB, the
22 more attractive an individual flush looks. Data and
23 instructions have separate TLBs, as do different page sizes.
24 4. The microarchitecture. The TLB has become a multi-level
25 cache on modern CPUs, and the global flushes have become more
26 expensive relative to single-page flushes.
27
28There is obviously no way the kernel can know all these things,
29especially the contents of the TLB during a given flush. The
30sizes of the flush will vary greatly depending on the workload as
31well. There is essentially no "right" point to choose.
32
33You may be doing too many individual invalidations if you see the
34invlpg instruction (or instructions _near_ it) show up high in
35profiles. If you believe that individual invalidations are being
36called too often, you can lower the tunable:
37
38	/sys/kernel/debug/x86/tlb_single_page_flush_ceiling
39
40This will cause us to do the global flush for more cases.
41Lowering it to 0 will disable the use of the individual flushes.
42Setting it to 1 is a very conservative setting and it should
43never need to be 0 under normal circumstances.
44
45Despite the fact that a single individual flush on x86 is
46guaranteed to flush a full 2MB [1], hugetlbfs always uses the full
47flushes. THP is treated exactly the same as normal memory.
48
49You might see invlpg inside of flush_tlb_mm_range() show up in
50profiles, or you can use the trace_tlb_flush() tracepoints to
51determine how long the flush operations are taking.
52
53Essentially, you are balancing the cycles you spend doing invlpg
54with the cycles that you spend refilling the TLB later.
55
56You can measure how expensive TLB refills are by using
57performance counters and 'perf stat', like this:
58
59perf stat -e
60 cpu/event=0x8,umask=0x84,name=dtlb_load_misses_walk_duration/,
61 cpu/event=0x8,umask=0x82,name=dtlb_load_misses_walk_completed/,
62 cpu/event=0x49,umask=0x4,name=dtlb_store_misses_walk_duration/,
63 cpu/event=0x49,umask=0x2,name=dtlb_store_misses_walk_completed/,
64 cpu/event=0x85,umask=0x4,name=itlb_misses_walk_duration/,
65 cpu/event=0x85,umask=0x2,name=itlb_misses_walk_completed/
66
67That works on an IvyBridge-era CPU (i5-3320M). Different CPUs
68may have differently-named counters, but they should at least
69be there in some form. You can use pmu-tools 'ocperf list'
70(https://github.com/andikleen/pmu-tools) to find the right
71counters for a given CPU.
72
731. A footnote in Intel's SDM "4.10.4.2 Recommended Invalidation"
74 says: "One execution of INVLPG is sufficient even for a page
75 with size greater than 4 KBytes."
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index be12c534fd59..166af2a8e865 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -3,6 +3,10 @@
3 3
4#include <asm/desc.h> 4#include <asm/desc.h>
5#include <linux/atomic.h> 5#include <linux/atomic.h>
6#include <linux/mm_types.h>
7
8#include <trace/events/tlb.h>
9
6#include <asm/pgalloc.h> 10#include <asm/pgalloc.h>
7#include <asm/tlbflush.h> 11#include <asm/tlbflush.h>
8#include <asm/paravirt.h> 12#include <asm/paravirt.h>
@@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
44 48
45 /* Re-load page tables */ 49 /* Re-load page tables */
46 load_cr3(next->pgd); 50 load_cr3(next->pgd);
51 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
47 52
48 /* Stop flush ipis for the previous mm */ 53 /* Stop flush ipis for the previous mm */
49 cpumask_clear_cpu(cpu, mm_cpumask(prev)); 54 cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
71 * to make sure to use no freed page tables. 76 * to make sure to use no freed page tables.
72 */ 77 */
73 load_cr3(next->pgd); 78 load_cr3(next->pgd);
79 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
74 load_LDT_nolock(&next->context); 80 load_LDT_nolock(&next->context);
75 } 81 }
76 } 82 }
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 32cc237f8e20..ee30b9f0b91c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO];
72extern u16 __read_mostly tlb_lld_2m[NR_INFO]; 72extern u16 __read_mostly tlb_lld_2m[NR_INFO];
73extern u16 __read_mostly tlb_lld_4m[NR_INFO]; 73extern u16 __read_mostly tlb_lld_4m[NR_INFO];
74extern u16 __read_mostly tlb_lld_1g[NR_INFO]; 74extern u16 __read_mostly tlb_lld_1g[NR_INFO];
75extern s8 __read_mostly tlb_flushall_shift;
76 75
77/* 76/*
78 * CPU type and hardware bug flags. Kept separately for each CPU. 77 * CPU type and hardware bug flags. Kept separately for each CPU.
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bc360d3df60e..60e5497681f5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -724,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
724} 724}
725#endif 725#endif
726 726
727static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
728{
729 tlb_flushall_shift = 6;
730}
731
732static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) 727static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
733{ 728{
734 u32 ebx, eax, ecx, edx; 729 u32 ebx, eax, ecx, edx;
@@ -776,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
776 tlb_lli_2m[ENTRIES] = eax & mask; 771 tlb_lli_2m[ENTRIES] = eax & mask;
777 772
778 tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; 773 tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
779
780 cpu_set_tlb_flushall_shift(c);
781} 774}
782 775
783static const struct cpu_dev amd_cpu_dev = { 776static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 188a8c5cc094..333fd5209336 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -481,26 +481,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
481u16 __read_mostly tlb_lld_4m[NR_INFO]; 481u16 __read_mostly tlb_lld_4m[NR_INFO];
482u16 __read_mostly tlb_lld_1g[NR_INFO]; 482u16 __read_mostly tlb_lld_1g[NR_INFO];
483 483
484/*
485 * tlb_flushall_shift shows the balance point in replacing cr3 write
486 * with multiple 'invlpg'. It will do this replacement when
487 * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
488 * If tlb_flushall_shift is -1, means the replacement will be disabled.
489 */
490s8 __read_mostly tlb_flushall_shift = -1;
491
492void cpu_detect_tlb(struct cpuinfo_x86 *c) 484void cpu_detect_tlb(struct cpuinfo_x86 *c)
493{ 485{
494 if (this_cpu->c_detect_tlb) 486 if (this_cpu->c_detect_tlb)
495 this_cpu->c_detect_tlb(c); 487 this_cpu->c_detect_tlb(c);
496 488
497 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" 489 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
498 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" 490 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
499 "tlb_flushall_shift: %d\n",
500 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], 491 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
501 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], 492 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
502 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], 493 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
503 tlb_lld_1g[ENTRIES], tlb_flushall_shift); 494 tlb_lld_1g[ENTRIES]);
504} 495}
505 496
506void detect_ht(struct cpuinfo_x86 *c) 497void detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9483ee5b3991..74e804ddc5c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc)
634 } 634 }
635} 635}
636 636
637static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
638{
639 switch ((c->x86 << 8) + c->x86_model) {
640 case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
641 case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
642 case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
643 case 0x61d: /* six-core 45 nm xeon "Dunnington" */
644 tlb_flushall_shift = -1;
645 break;
646 case 0x63a: /* Ivybridge */
647 tlb_flushall_shift = 2;
648 break;
649 case 0x61a: /* 45 nm nehalem, "Bloomfield" */
650 case 0x61e: /* 45 nm nehalem, "Lynnfield" */
651 case 0x625: /* 32 nm nehalem, "Clarkdale" */
652 case 0x62c: /* 32 nm nehalem, "Gulftown" */
653 case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
654 case 0x62f: /* 32 nm Xeon E7 */
655 case 0x62a: /* SandyBridge */
656 case 0x62d: /* SandyBridge, "Romely-EP" */
657 default:
658 tlb_flushall_shift = 6;
659 }
660}
661
662static void intel_detect_tlb(struct cpuinfo_x86 *c) 637static void intel_detect_tlb(struct cpuinfo_x86 *c)
663{ 638{
664 int i, j, n; 639 int i, j, n;
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
683 for (j = 1 ; j < 16 ; j++) 658 for (j = 1 ; j < 16 ; j++)
684 intel_tlb_lookup(desc[j]); 659 intel_tlb_lookup(desc[j]);
685 } 660 }
686 intel_tlb_flushall_shift_set(c);
687} 661}
688 662
689static const struct cpu_dev intel_cpu_dev = { 663static const struct cpu_dev intel_cpu_dev = {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 36642793e315..1dbade870f90 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -577,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
577 577
578static const char nx_warning[] = KERN_CRIT 578static const char nx_warning[] = KERN_CRIT
579"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; 579"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
580static const char smep_warning[] = KERN_CRIT
581"unable to execute userspace code (SMEP?) (uid: %d)\n";
580 582
581static void 583static void
582show_fault_oops(struct pt_regs *regs, unsigned long error_code, 584show_fault_oops(struct pt_regs *regs, unsigned long error_code,
@@ -597,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
597 599
598 if (pte && pte_present(*pte) && !pte_exec(*pte)) 600 if (pte && pte_present(*pte) && !pte_exec(*pte))
599 printk(nx_warning, from_kuid(&init_user_ns, current_uid())); 601 printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
602 if (pte && pte_present(*pte) && pte_exec(*pte) &&
603 (pgd_flags(*pgd) & _PAGE_USER) &&
604 (read_cr4() & X86_CR4_SMEP))
605 printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
600 } 606 }
601 607
602 printk(KERN_ALERT "BUG: unable to handle kernel "); 608 printk(KERN_ALERT "BUG: unable to handle kernel ");
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f97130618113..66dba36f2343 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,13 @@
18#include <asm/dma.h> /* for MAX_DMA_PFN */ 18#include <asm/dma.h> /* for MAX_DMA_PFN */
19#include <asm/microcode.h> 19#include <asm/microcode.h>
20 20
21/*
22 * We need to define the tracepoints somewhere, and tlb.c
23 * is only compied when SMP=y.
24 */
25#define CREATE_TRACE_POINTS
26#include <trace/events/tlb.h>
27
21#include "mm_internal.h" 28#include "mm_internal.h"
22 29
23static unsigned long __initdata pgt_buf_start; 30static unsigned long __initdata pgt_buf_start;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dd8dda167a24..1fe33987de02 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -49,6 +49,7 @@ void leave_mm(int cpu)
49 if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { 49 if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
50 cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); 50 cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
51 load_cr3(swapper_pg_dir); 51 load_cr3(swapper_pg_dir);
52 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
52 } 53 }
53} 54}
54EXPORT_SYMBOL_GPL(leave_mm); 55EXPORT_SYMBOL_GPL(leave_mm);
@@ -102,20 +103,24 @@ static void flush_tlb_func(void *info)
102 103
103 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) 104 if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
104 return; 105 return;
106 if (!f->flush_end)
107 f->flush_end = f->flush_start + PAGE_SIZE;
105 108
106 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 109 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
107 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 110 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (f->flush_end == TLB_FLUSH_ALL) 111 if (f->flush_end == TLB_FLUSH_ALL) {
109 local_flush_tlb(); 112 local_flush_tlb();
110 else if (!f->flush_end) 113 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
111 __flush_tlb_single(f->flush_start); 114 } else {
112 else {
113 unsigned long addr; 115 unsigned long addr;
116 unsigned long nr_pages =
117 f->flush_end - f->flush_start / PAGE_SIZE;
114 addr = f->flush_start; 118 addr = f->flush_start;
115 while (addr < f->flush_end) { 119 while (addr < f->flush_end) {
116 __flush_tlb_single(addr); 120 __flush_tlb_single(addr);
117 addr += PAGE_SIZE; 121 addr += PAGE_SIZE;
118 } 122 }
123 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
119 } 124 }
120 } else 125 } else
121 leave_mm(smp_processor_id()); 126 leave_mm(smp_processor_id());
@@ -153,46 +158,45 @@ void flush_tlb_current_task(void)
153 158
154 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 159 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
155 local_flush_tlb(); 160 local_flush_tlb();
161 trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
156 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 162 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
157 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); 163 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
158 preempt_enable(); 164 preempt_enable();
159} 165}
160 166
167/*
168 * See Documentation/x86/tlb.txt for details. We choose 33
169 * because it is large enough to cover the vast majority (at
170 * least 95%) of allocations, and is small enough that we are
171 * confident it will not cause too much overhead. Each single
172 * flush is about 100 ns, so this caps the maximum overhead at
173 * _about_ 3,000 ns.
174 *
175 * This is in units of pages.
176 */
177unsigned long tlb_single_page_flush_ceiling = 33;
178
161void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 179void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
162 unsigned long end, unsigned long vmflag) 180 unsigned long end, unsigned long vmflag)
163{ 181{
164 unsigned long addr; 182 unsigned long addr;
165 unsigned act_entries, tlb_entries = 0; 183 /* do a global flush by default */
166 unsigned long nr_base_pages; 184 unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
167 185
168 preempt_disable(); 186 preempt_disable();
169 if (current->active_mm != mm) 187 if (current->active_mm != mm)
170 goto flush_all; 188 goto out;
171 189
172 if (!current->mm) { 190 if (!current->mm) {
173 leave_mm(smp_processor_id()); 191 leave_mm(smp_processor_id());
174 goto flush_all; 192 goto out;
175 } 193 }
176 194
177 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 195 if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
178 || vmflag & VM_HUGETLB) { 196 base_pages_to_flush = (end - start) >> PAGE_SHIFT;
179 local_flush_tlb();
180 goto flush_all;
181 }
182
183 /* In modern CPU, last level tlb used for both data/ins */
184 if (vmflag & VM_EXEC)
185 tlb_entries = tlb_lli_4k[ENTRIES];
186 else
187 tlb_entries = tlb_lld_4k[ENTRIES];
188 197
189 /* Assume all of TLB entries was occupied by this task */ 198 if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
190 act_entries = tlb_entries >> tlb_flushall_shift; 199 base_pages_to_flush = TLB_FLUSH_ALL;
191 act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
192 nr_base_pages = (end - start) >> PAGE_SHIFT;
193
194 /* tlb_flushall_shift is on balance point, details in commit log */
195 if (nr_base_pages > act_entries) {
196 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 200 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
197 local_flush_tlb(); 201 local_flush_tlb();
198 } else { 202 } else {
@@ -201,17 +205,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
201 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); 205 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
202 __flush_tlb_single(addr); 206 __flush_tlb_single(addr);
203 } 207 }
204
205 if (cpumask_any_but(mm_cpumask(mm),
206 smp_processor_id()) < nr_cpu_ids)
207 flush_tlb_others(mm_cpumask(mm), mm, start, end);
208 preempt_enable();
209 return;
210 } 208 }
211 209 trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
212flush_all: 210out:
211 if (base_pages_to_flush == TLB_FLUSH_ALL) {
212 start = 0UL;
213 end = TLB_FLUSH_ALL;
214 }
213 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 215 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
214 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); 216 flush_tlb_others(mm_cpumask(mm), mm, start, end);
215 preempt_enable(); 217 preempt_enable();
216} 218}
217 219
@@ -260,32 +262,26 @@ static void do_kernel_range_flush(void *info)
260 262
261void flush_tlb_kernel_range(unsigned long start, unsigned long end) 263void flush_tlb_kernel_range(unsigned long start, unsigned long end)
262{ 264{
263 unsigned act_entries;
264 struct flush_tlb_info info;
265
266 /* In modern CPU, last level tlb used for both data/ins */
267 act_entries = tlb_lld_4k[ENTRIES];
268 265
269 /* Balance as user space task's flush, a bit conservative */ 266 /* Balance as user space task's flush, a bit conservative */
270 if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || 267 if (end == TLB_FLUSH_ALL ||
271 (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) 268 (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
272
273 on_each_cpu(do_flush_tlb_all, NULL, 1); 269 on_each_cpu(do_flush_tlb_all, NULL, 1);
274 else { 270 } else {
271 struct flush_tlb_info info;
275 info.flush_start = start; 272 info.flush_start = start;
276 info.flush_end = end; 273 info.flush_end = end;
277 on_each_cpu(do_kernel_range_flush, &info, 1); 274 on_each_cpu(do_kernel_range_flush, &info, 1);
278 } 275 }
279} 276}
280 277
281#ifdef CONFIG_DEBUG_TLBFLUSH
282static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, 278static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
283 size_t count, loff_t *ppos) 279 size_t count, loff_t *ppos)
284{ 280{
285 char buf[32]; 281 char buf[32];
286 unsigned int len; 282 unsigned int len;
287 283
288 len = sprintf(buf, "%hd\n", tlb_flushall_shift); 284 len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
289 return simple_read_from_buffer(user_buf, count, ppos, buf, len); 285 return simple_read_from_buffer(user_buf, count, ppos, buf, len);
290} 286}
291 287
@@ -294,20 +290,20 @@ static ssize_t tlbflush_write_file(struct file *file,
294{ 290{
295 char buf[32]; 291 char buf[32];
296 ssize_t len; 292 ssize_t len;
297 s8 shift; 293 int ceiling;
298 294
299 len = min(count, sizeof(buf) - 1); 295 len = min(count, sizeof(buf) - 1);
300 if (copy_from_user(buf, user_buf, len)) 296 if (copy_from_user(buf, user_buf, len))
301 return -EFAULT; 297 return -EFAULT;
302 298
303 buf[len] = '\0'; 299 buf[len] = '\0';
304 if (kstrtos8(buf, 0, &shift)) 300 if (kstrtoint(buf, 0, &ceiling))
305 return -EINVAL; 301 return -EINVAL;
306 302
307 if (shift < -1 || shift >= BITS_PER_LONG) 303 if (ceiling < 0)
308 return -EINVAL; 304 return -EINVAL;
309 305
310 tlb_flushall_shift = shift; 306 tlb_single_page_flush_ceiling = ceiling;
311 return count; 307 return count;
312} 308}
313 309
@@ -317,11 +313,10 @@ static const struct file_operations fops_tlbflush = {
317 .llseek = default_llseek, 313 .llseek = default_llseek,
318}; 314};
319 315
320static int __init create_tlb_flushall_shift(void) 316static int __init create_tlb_single_page_flush_ceiling(void)
321{ 317{
322 debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, 318 debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
323 arch_debugfs_dir, NULL, &fops_tlbflush); 319 arch_debugfs_dir, NULL, &fops_tlbflush);
324 return 0; 320 return 0;
325} 321}
326late_initcall(create_tlb_flushall_shift); 322late_initcall(create_tlb_single_page_flush_ceiling);
327#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 96c5750e3110..796deac19fcf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -516,4 +516,12 @@ struct vm_special_mapping
516 struct page **pages; 516 struct page **pages;
517}; 517};
518 518
519enum tlb_flush_reason {
520 TLB_FLUSH_ON_TASK_SWITCH,
521 TLB_REMOTE_SHOOTDOWN,
522 TLB_LOCAL_SHOOTDOWN,
523 TLB_LOCAL_MM_SHOOTDOWN,
524 NR_TLB_FLUSH_REASONS,
525};
526
519#endif /* _LINUX_MM_TYPES_H */ 527#endif /* _LINUX_MM_TYPES_H */
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
new file mode 100644
index 000000000000..13391d288107
--- /dev/null
+++ b/include/trace/events/tlb.h
@@ -0,0 +1,40 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM tlb
3
4#if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TLB_H
6
7#include <linux/mm_types.h>
8#include <linux/tracepoint.h>
9
10#define TLB_FLUSH_REASON \
11 { TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" }, \
12 { TLB_REMOTE_SHOOTDOWN, "remote shootdown" }, \
13 { TLB_LOCAL_SHOOTDOWN, "local shootdown" }, \
14 { TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" }
15
16TRACE_EVENT(tlb_flush,
17
18 TP_PROTO(int reason, unsigned long pages),
19 TP_ARGS(reason, pages),
20
21 TP_STRUCT__entry(
22 __field( int, reason)
23 __field(unsigned long, pages)
24 ),
25
26 TP_fast_assign(
27 __entry->reason = reason;
28 __entry->pages = pages;
29 ),
30
31 TP_printk("pages:%ld reason:%s (%d)",
32 __entry->pages,
33 __print_symbolic(__entry->reason, TLB_FLUSH_REASON),
34 __entry->reason)
35);
36
37#endif /* _TRACE_TLB_H */
38
39/* This part must be outside protection */
40#include <trace/define_trace.h>