diff options
-rw-r--r-- | Documentation/x86/tlb.txt | 75 | ||||
-rw-r--r-- | arch/x86/include/asm/mmu_context.h | 6 | ||||
-rw-r--r-- | arch/x86/include/asm/processor.h | 1 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 7 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 13 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/intel.c | 26 | ||||
-rw-r--r-- | arch/x86/mm/fault.c | 6 | ||||
-rw-r--r-- | arch/x86/mm/init.c | 7 | ||||
-rw-r--r-- | arch/x86/mm/tlb.c | 103 | ||||
-rw-r--r-- | include/linux/mm_types.h | 8 | ||||
-rw-r--r-- | include/trace/events/tlb.h | 40 |
11 files changed, 193 insertions, 99 deletions
diff --git a/Documentation/x86/tlb.txt b/Documentation/x86/tlb.txt new file mode 100644 index 000000000000..2b3a82e69151 --- /dev/null +++ b/Documentation/x86/tlb.txt | |||
@@ -0,0 +1,75 @@ | |||
1 | When the kernel unmaps or modifies the attributes of a range of | ||
2 | memory, it has two choices: | ||
3 | 1. Flush the entire TLB with a two-instruction sequence. This is | ||
4 | a quick operation, but it causes collateral damage: TLB entries | ||
5 | from areas other than the one we are trying to flush will be | ||
6 | destroyed and must be refilled later, at some cost. | ||
7 | 2. Use the invlpg instruction to invalidate a single page at a | ||
8 | time. This could potentially cost many more instructions, but | ||
9 | it is a much more precise operation, causing no collateral | ||
10 | damage to other TLB entries. | ||
11 | |||
12 | Which method to use depends on a few things: | ||
13 | 1. The size of the flush being performed. A flush of the entire | ||
14 | address space is obviously better performed by flushing the | ||
15 | entire TLB than doing 2^48/PAGE_SIZE individual flushes. | ||
16 | 2. The contents of the TLB. If the TLB is empty, then there will | ||
17 | be no collateral damage caused by doing the global flush, and | ||
18 | all of the individual flushes will have ended up being wasted | ||
19 | work. | ||
20 | 3. The size of the TLB. The larger the TLB, the more collateral | ||
21 | damage we do with a full flush. So, the larger the TLB, the | ||
22 | more attractive an individual flush looks. Data and | ||
23 | instructions have separate TLBs, as do different page sizes. | ||
24 | 4. The microarchitecture. The TLB has become a multi-level | ||
25 | cache on modern CPUs, and the global flushes have become more | ||
26 | expensive relative to single-page flushes. | ||
27 | |||
28 | There is obviously no way the kernel can know all these things, | ||
29 | especially the contents of the TLB during a given flush. The | ||
30 | sizes of the flush will vary greatly depending on the workload as | ||
31 | well. There is essentially no "right" point to choose. | ||
32 | |||
33 | You may be doing too many individual invalidations if you see the | ||
34 | invlpg instruction (or instructions _near_ it) show up high in | ||
35 | profiles. If you believe that individual invalidations are being | ||
36 | called too often, you can lower the tunable: | ||
37 | |||
38 | /sys/kernel/debug/x86/tlb_single_page_flush_ceiling | ||
39 | |||
40 | This will cause us to do the global flush for more cases. | ||
41 | Lowering it to 0 will disable the use of the individual flushes. | ||
42 | Setting it to 1 is a very conservative setting and it should | ||
43 | never need to be 0 under normal circumstances. | ||
44 | |||
45 | Despite the fact that a single individual flush on x86 is | ||
46 | guaranteed to flush a full 2MB [1], hugetlbfs always uses the full | ||
47 | flushes. THP is treated exactly the same as normal memory. | ||
48 | |||
49 | You might see invlpg inside of flush_tlb_mm_range() show up in | ||
50 | profiles, or you can use the trace_tlb_flush() tracepoints to | ||
51 | determine how long the flush operations are taking. | ||
52 | |||
53 | Essentially, you are balancing the cycles you spend doing invlpg | ||
54 | with the cycles that you spend refilling the TLB later. | ||
55 | |||
56 | You can measure how expensive TLB refills are by using | ||
57 | performance counters and 'perf stat', like this: | ||
58 | |||
59 | perf stat -e | ||
60 | cpu/event=0x8,umask=0x84,name=dtlb_load_misses_walk_duration/, | ||
61 | cpu/event=0x8,umask=0x82,name=dtlb_load_misses_walk_completed/, | ||
62 | cpu/event=0x49,umask=0x4,name=dtlb_store_misses_walk_duration/, | ||
63 | cpu/event=0x49,umask=0x2,name=dtlb_store_misses_walk_completed/, | ||
64 | cpu/event=0x85,umask=0x4,name=itlb_misses_walk_duration/, | ||
65 | cpu/event=0x85,umask=0x2,name=itlb_misses_walk_completed/ | ||
66 | |||
67 | That works on an IvyBridge-era CPU (i5-3320M). Different CPUs | ||
68 | may have differently-named counters, but they should at least | ||
69 | be there in some form. You can use pmu-tools 'ocperf list' | ||
70 | (https://github.com/andikleen/pmu-tools) to find the right | ||
71 | counters for a given CPU. | ||
72 | |||
73 | 1. A footnote in Intel's SDM "4.10.4.2 Recommended Invalidation" | ||
74 | says: "One execution of INVLPG is sufficient even for a page | ||
75 | with size greater than 4 KBytes." | ||
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index be12c534fd59..166af2a8e865 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h | |||
@@ -3,6 +3,10 @@ | |||
3 | 3 | ||
4 | #include <asm/desc.h> | 4 | #include <asm/desc.h> |
5 | #include <linux/atomic.h> | 5 | #include <linux/atomic.h> |
6 | #include <linux/mm_types.h> | ||
7 | |||
8 | #include <trace/events/tlb.h> | ||
9 | |||
6 | #include <asm/pgalloc.h> | 10 | #include <asm/pgalloc.h> |
7 | #include <asm/tlbflush.h> | 11 | #include <asm/tlbflush.h> |
8 | #include <asm/paravirt.h> | 12 | #include <asm/paravirt.h> |
@@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
44 | 48 | ||
45 | /* Re-load page tables */ | 49 | /* Re-load page tables */ |
46 | load_cr3(next->pgd); | 50 | load_cr3(next->pgd); |
51 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
47 | 52 | ||
48 | /* Stop flush ipis for the previous mm */ | 53 | /* Stop flush ipis for the previous mm */ |
49 | cpumask_clear_cpu(cpu, mm_cpumask(prev)); | 54 | cpumask_clear_cpu(cpu, mm_cpumask(prev)); |
@@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
71 | * to make sure to use no freed page tables. | 76 | * to make sure to use no freed page tables. |
72 | */ | 77 | */ |
73 | load_cr3(next->pgd); | 78 | load_cr3(next->pgd); |
79 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
74 | load_LDT_nolock(&next->context); | 80 | load_LDT_nolock(&next->context); |
75 | } | 81 | } |
76 | } | 82 | } |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 32cc237f8e20..ee30b9f0b91c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
@@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO]; | |||
72 | extern u16 __read_mostly tlb_lld_2m[NR_INFO]; | 72 | extern u16 __read_mostly tlb_lld_2m[NR_INFO]; |
73 | extern u16 __read_mostly tlb_lld_4m[NR_INFO]; | 73 | extern u16 __read_mostly tlb_lld_4m[NR_INFO]; |
74 | extern u16 __read_mostly tlb_lld_1g[NR_INFO]; | 74 | extern u16 __read_mostly tlb_lld_1g[NR_INFO]; |
75 | extern s8 __read_mostly tlb_flushall_shift; | ||
76 | 75 | ||
77 | /* | 76 | /* |
78 | * CPU type and hardware bug flags. Kept separately for each CPU. | 77 | * CPU type and hardware bug flags. Kept separately for each CPU. |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bc360d3df60e..60e5497681f5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -724,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) | |||
724 | } | 724 | } |
725 | #endif | 725 | #endif |
726 | 726 | ||
727 | static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) | ||
728 | { | ||
729 | tlb_flushall_shift = 6; | ||
730 | } | ||
731 | |||
732 | static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) | 727 | static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) |
733 | { | 728 | { |
734 | u32 ebx, eax, ecx, edx; | 729 | u32 ebx, eax, ecx, edx; |
@@ -776,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) | |||
776 | tlb_lli_2m[ENTRIES] = eax & mask; | 771 | tlb_lli_2m[ENTRIES] = eax & mask; |
777 | 772 | ||
778 | tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; | 773 | tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; |
779 | |||
780 | cpu_set_tlb_flushall_shift(c); | ||
781 | } | 774 | } |
782 | 775 | ||
783 | static const struct cpu_dev amd_cpu_dev = { | 776 | static const struct cpu_dev amd_cpu_dev = { |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 188a8c5cc094..333fd5209336 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -481,26 +481,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO]; | |||
481 | u16 __read_mostly tlb_lld_4m[NR_INFO]; | 481 | u16 __read_mostly tlb_lld_4m[NR_INFO]; |
482 | u16 __read_mostly tlb_lld_1g[NR_INFO]; | 482 | u16 __read_mostly tlb_lld_1g[NR_INFO]; |
483 | 483 | ||
484 | /* | ||
485 | * tlb_flushall_shift shows the balance point in replacing cr3 write | ||
486 | * with multiple 'invlpg'. It will do this replacement when | ||
487 | * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. | ||
488 | * If tlb_flushall_shift is -1, means the replacement will be disabled. | ||
489 | */ | ||
490 | s8 __read_mostly tlb_flushall_shift = -1; | ||
491 | |||
492 | void cpu_detect_tlb(struct cpuinfo_x86 *c) | 484 | void cpu_detect_tlb(struct cpuinfo_x86 *c) |
493 | { | 485 | { |
494 | if (this_cpu->c_detect_tlb) | 486 | if (this_cpu->c_detect_tlb) |
495 | this_cpu->c_detect_tlb(c); | 487 | this_cpu->c_detect_tlb(c); |
496 | 488 | ||
497 | printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" | 489 | printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" |
498 | "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" | 490 | "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", |
499 | "tlb_flushall_shift: %d\n", | ||
500 | tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], | 491 | tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], |
501 | tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], | 492 | tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], |
502 | tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], | 493 | tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], |
503 | tlb_lld_1g[ENTRIES], tlb_flushall_shift); | 494 | tlb_lld_1g[ENTRIES]); |
504 | } | 495 | } |
505 | 496 | ||
506 | void detect_ht(struct cpuinfo_x86 *c) | 497 | void detect_ht(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9483ee5b3991..74e804ddc5c7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc) | |||
634 | } | 634 | } |
635 | } | 635 | } |
636 | 636 | ||
637 | static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) | ||
638 | { | ||
639 | switch ((c->x86 << 8) + c->x86_model) { | ||
640 | case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ | ||
641 | case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ | ||
642 | case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ | ||
643 | case 0x61d: /* six-core 45 nm xeon "Dunnington" */ | ||
644 | tlb_flushall_shift = -1; | ||
645 | break; | ||
646 | case 0x63a: /* Ivybridge */ | ||
647 | tlb_flushall_shift = 2; | ||
648 | break; | ||
649 | case 0x61a: /* 45 nm nehalem, "Bloomfield" */ | ||
650 | case 0x61e: /* 45 nm nehalem, "Lynnfield" */ | ||
651 | case 0x625: /* 32 nm nehalem, "Clarkdale" */ | ||
652 | case 0x62c: /* 32 nm nehalem, "Gulftown" */ | ||
653 | case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ | ||
654 | case 0x62f: /* 32 nm Xeon E7 */ | ||
655 | case 0x62a: /* SandyBridge */ | ||
656 | case 0x62d: /* SandyBridge, "Romely-EP" */ | ||
657 | default: | ||
658 | tlb_flushall_shift = 6; | ||
659 | } | ||
660 | } | ||
661 | |||
662 | static void intel_detect_tlb(struct cpuinfo_x86 *c) | 637 | static void intel_detect_tlb(struct cpuinfo_x86 *c) |
663 | { | 638 | { |
664 | int i, j, n; | 639 | int i, j, n; |
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) | |||
683 | for (j = 1 ; j < 16 ; j++) | 658 | for (j = 1 ; j < 16 ; j++) |
684 | intel_tlb_lookup(desc[j]); | 659 | intel_tlb_lookup(desc[j]); |
685 | } | 660 | } |
686 | intel_tlb_flushall_shift_set(c); | ||
687 | } | 661 | } |
688 | 662 | ||
689 | static const struct cpu_dev intel_cpu_dev = { | 663 | static const struct cpu_dev intel_cpu_dev = { |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 36642793e315..1dbade870f90 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -577,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) | |||
577 | 577 | ||
578 | static const char nx_warning[] = KERN_CRIT | 578 | static const char nx_warning[] = KERN_CRIT |
579 | "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; | 579 | "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; |
580 | static const char smep_warning[] = KERN_CRIT | ||
581 | "unable to execute userspace code (SMEP?) (uid: %d)\n"; | ||
580 | 582 | ||
581 | static void | 583 | static void |
582 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, | 584 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, |
@@ -597,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, | |||
597 | 599 | ||
598 | if (pte && pte_present(*pte) && !pte_exec(*pte)) | 600 | if (pte && pte_present(*pte) && !pte_exec(*pte)) |
599 | printk(nx_warning, from_kuid(&init_user_ns, current_uid())); | 601 | printk(nx_warning, from_kuid(&init_user_ns, current_uid())); |
602 | if (pte && pte_present(*pte) && pte_exec(*pte) && | ||
603 | (pgd_flags(*pgd) & _PAGE_USER) && | ||
604 | (read_cr4() & X86_CR4_SMEP)) | ||
605 | printk(smep_warning, from_kuid(&init_user_ns, current_uid())); | ||
600 | } | 606 | } |
601 | 607 | ||
602 | printk(KERN_ALERT "BUG: unable to handle kernel "); | 608 | printk(KERN_ALERT "BUG: unable to handle kernel "); |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f97130618113..66dba36f2343 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
@@ -18,6 +18,13 @@ | |||
18 | #include <asm/dma.h> /* for MAX_DMA_PFN */ | 18 | #include <asm/dma.h> /* for MAX_DMA_PFN */ |
19 | #include <asm/microcode.h> | 19 | #include <asm/microcode.h> |
20 | 20 | ||
21 | /* | ||
22 | * We need to define the tracepoints somewhere, and tlb.c | ||
23 | * is only compiled when SMP=y. | ||
24 | */ | ||
25 | #define CREATE_TRACE_POINTS | ||
26 | #include <trace/events/tlb.h> | ||
27 | |||
21 | #include "mm_internal.h" | 28 | #include "mm_internal.h" |
22 | 29 | ||
23 | static unsigned long __initdata pgt_buf_start; | 30 | static unsigned long __initdata pgt_buf_start; |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index dd8dda167a24..1fe33987de02 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -49,6 +49,7 @@ void leave_mm(int cpu) | |||
49 | if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { | 49 | if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { |
50 | cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); | 50 | cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); |
51 | load_cr3(swapper_pg_dir); | 51 | load_cr3(swapper_pg_dir); |
52 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
52 | } | 53 | } |
53 | } | 54 | } |
54 | EXPORT_SYMBOL_GPL(leave_mm); | 55 | EXPORT_SYMBOL_GPL(leave_mm); |
@@ -102,20 +103,24 @@ static void flush_tlb_func(void *info) | |||
102 | 103 | ||
103 | if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) | 104 | if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) |
104 | return; | 105 | return; |
106 | if (!f->flush_end) | ||
107 | f->flush_end = f->flush_start + PAGE_SIZE; | ||
105 | 108 | ||
106 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); | 109 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); |
107 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { | 110 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { |
108 | if (f->flush_end == TLB_FLUSH_ALL) | 111 | if (f->flush_end == TLB_FLUSH_ALL) { |
109 | local_flush_tlb(); | 112 | local_flush_tlb(); |
110 | else if (!f->flush_end) | 113 | trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); |
111 | __flush_tlb_single(f->flush_start); | 114 | } else { |
112 | else { | ||
113 | unsigned long addr; | 115 | unsigned long addr; |
116 | unsigned long nr_pages = | ||
117 | (f->flush_end - f->flush_start) / PAGE_SIZE; | ||
114 | addr = f->flush_start; | 118 | addr = f->flush_start; |
115 | while (addr < f->flush_end) { | 119 | while (addr < f->flush_end) { |
116 | __flush_tlb_single(addr); | 120 | __flush_tlb_single(addr); |
117 | addr += PAGE_SIZE; | 121 | addr += PAGE_SIZE; |
118 | } | 122 | } |
123 | trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); | ||
119 | } | 124 | } |
120 | } else | 125 | } else |
121 | leave_mm(smp_processor_id()); | 126 | leave_mm(smp_processor_id()); |
@@ -153,46 +158,45 @@ void flush_tlb_current_task(void) | |||
153 | 158 | ||
154 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | 159 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
155 | local_flush_tlb(); | 160 | local_flush_tlb(); |
161 | trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); | ||
156 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | 162 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
157 | flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); | 163 | flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); |
158 | preempt_enable(); | 164 | preempt_enable(); |
159 | } | 165 | } |
160 | 166 | ||
167 | /* | ||
168 | * See Documentation/x86/tlb.txt for details. We choose 33 | ||
169 | * because it is large enough to cover the vast majority (at | ||
170 | * least 95%) of allocations, and is small enough that we are | ||
171 | * confident it will not cause too much overhead. Each single | ||
172 | * flush is about 100 ns, so this caps the maximum overhead at | ||
173 | * _about_ 3,000 ns. | ||
174 | * | ||
175 | * This is in units of pages. | ||
176 | */ | ||
177 | unsigned long tlb_single_page_flush_ceiling = 33; | ||
178 | |||
161 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 179 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
162 | unsigned long end, unsigned long vmflag) | 180 | unsigned long end, unsigned long vmflag) |
163 | { | 181 | { |
164 | unsigned long addr; | 182 | unsigned long addr; |
165 | unsigned act_entries, tlb_entries = 0; | 183 | /* do a global flush by default */ |
166 | unsigned long nr_base_pages; | 184 | unsigned long base_pages_to_flush = TLB_FLUSH_ALL; |
167 | 185 | ||
168 | preempt_disable(); | 186 | preempt_disable(); |
169 | if (current->active_mm != mm) | 187 | if (current->active_mm != mm) |
170 | goto flush_all; | 188 | goto out; |
171 | 189 | ||
172 | if (!current->mm) { | 190 | if (!current->mm) { |
173 | leave_mm(smp_processor_id()); | 191 | leave_mm(smp_processor_id()); |
174 | goto flush_all; | 192 | goto out; |
175 | } | 193 | } |
176 | 194 | ||
177 | if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 | 195 | if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) |
178 | || vmflag & VM_HUGETLB) { | 196 | base_pages_to_flush = (end - start) >> PAGE_SHIFT; |
179 | local_flush_tlb(); | ||
180 | goto flush_all; | ||
181 | } | ||
182 | |||
183 | /* In modern CPU, last level tlb used for both data/ins */ | ||
184 | if (vmflag & VM_EXEC) | ||
185 | tlb_entries = tlb_lli_4k[ENTRIES]; | ||
186 | else | ||
187 | tlb_entries = tlb_lld_4k[ENTRIES]; | ||
188 | 197 | ||
189 | /* Assume all of TLB entries was occupied by this task */ | 198 | if (base_pages_to_flush > tlb_single_page_flush_ceiling) { |
190 | act_entries = tlb_entries >> tlb_flushall_shift; | 199 | base_pages_to_flush = TLB_FLUSH_ALL; |
191 | act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; | ||
192 | nr_base_pages = (end - start) >> PAGE_SHIFT; | ||
193 | |||
194 | /* tlb_flushall_shift is on balance point, details in commit log */ | ||
195 | if (nr_base_pages > act_entries) { | ||
196 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | 200 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
197 | local_flush_tlb(); | 201 | local_flush_tlb(); |
198 | } else { | 202 | } else { |
@@ -201,17 +205,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | |||
201 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); | 205 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); |
202 | __flush_tlb_single(addr); | 206 | __flush_tlb_single(addr); |
203 | } | 207 | } |
204 | |||
205 | if (cpumask_any_but(mm_cpumask(mm), | ||
206 | smp_processor_id()) < nr_cpu_ids) | ||
207 | flush_tlb_others(mm_cpumask(mm), mm, start, end); | ||
208 | preempt_enable(); | ||
209 | return; | ||
210 | } | 208 | } |
211 | 209 | trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); | |
212 | flush_all: | 210 | out: |
211 | if (base_pages_to_flush == TLB_FLUSH_ALL) { | ||
212 | start = 0UL; | ||
213 | end = TLB_FLUSH_ALL; | ||
214 | } | ||
213 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | 215 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
214 | flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); | 216 | flush_tlb_others(mm_cpumask(mm), mm, start, end); |
215 | preempt_enable(); | 217 | preempt_enable(); |
216 | } | 218 | } |
217 | 219 | ||
@@ -260,32 +262,26 @@ static void do_kernel_range_flush(void *info) | |||
260 | 262 | ||
261 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) | 263 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) |
262 | { | 264 | { |
263 | unsigned act_entries; | ||
264 | struct flush_tlb_info info; | ||
265 | |||
266 | /* In modern CPU, last level tlb used for both data/ins */ | ||
267 | act_entries = tlb_lld_4k[ENTRIES]; | ||
268 | 265 | ||
269 | /* Balance as user space task's flush, a bit conservative */ | 266 | /* Balance as user space task's flush, a bit conservative */ |
270 | if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || | 267 | if (end == TLB_FLUSH_ALL || |
271 | (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) | 268 | (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { |
272 | |||
273 | on_each_cpu(do_flush_tlb_all, NULL, 1); | 269 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
274 | else { | 270 | } else { |
271 | struct flush_tlb_info info; | ||
275 | info.flush_start = start; | 272 | info.flush_start = start; |
276 | info.flush_end = end; | 273 | info.flush_end = end; |
277 | on_each_cpu(do_kernel_range_flush, &info, 1); | 274 | on_each_cpu(do_kernel_range_flush, &info, 1); |
278 | } | 275 | } |
279 | } | 276 | } |
280 | 277 | ||
281 | #ifdef CONFIG_DEBUG_TLBFLUSH | ||
282 | static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, | 278 | static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, |
283 | size_t count, loff_t *ppos) | 279 | size_t count, loff_t *ppos) |
284 | { | 280 | { |
285 | char buf[32]; | 281 | char buf[32]; |
286 | unsigned int len; | 282 | unsigned int len; |
287 | 283 | ||
288 | len = sprintf(buf, "%hd\n", tlb_flushall_shift); | 284 | len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling); |
289 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); | 285 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); |
290 | } | 286 | } |
291 | 287 | ||
@@ -294,20 +290,20 @@ static ssize_t tlbflush_write_file(struct file *file, | |||
294 | { | 290 | { |
295 | char buf[32]; | 291 | char buf[32]; |
296 | ssize_t len; | 292 | ssize_t len; |
297 | s8 shift; | 293 | int ceiling; |
298 | 294 | ||
299 | len = min(count, sizeof(buf) - 1); | 295 | len = min(count, sizeof(buf) - 1); |
300 | if (copy_from_user(buf, user_buf, len)) | 296 | if (copy_from_user(buf, user_buf, len)) |
301 | return -EFAULT; | 297 | return -EFAULT; |
302 | 298 | ||
303 | buf[len] = '\0'; | 299 | buf[len] = '\0'; |
304 | if (kstrtos8(buf, 0, &shift)) | 300 | if (kstrtoint(buf, 0, &ceiling)) |
305 | return -EINVAL; | 301 | return -EINVAL; |
306 | 302 | ||
307 | if (shift < -1 || shift >= BITS_PER_LONG) | 303 | if (ceiling < 0) |
308 | return -EINVAL; | 304 | return -EINVAL; |
309 | 305 | ||
310 | tlb_flushall_shift = shift; | 306 | tlb_single_page_flush_ceiling = ceiling; |
311 | return count; | 307 | return count; |
312 | } | 308 | } |
313 | 309 | ||
@@ -317,11 +313,10 @@ static const struct file_operations fops_tlbflush = { | |||
317 | .llseek = default_llseek, | 313 | .llseek = default_llseek, |
318 | }; | 314 | }; |
319 | 315 | ||
320 | static int __init create_tlb_flushall_shift(void) | 316 | static int __init create_tlb_single_page_flush_ceiling(void) |
321 | { | 317 | { |
322 | debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, | 318 | debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR, |
323 | arch_debugfs_dir, NULL, &fops_tlbflush); | 319 | arch_debugfs_dir, NULL, &fops_tlbflush); |
324 | return 0; | 320 | return 0; |
325 | } | 321 | } |
326 | late_initcall(create_tlb_flushall_shift); | 322 | late_initcall(create_tlb_single_page_flush_ceiling); |
327 | #endif | ||
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 96c5750e3110..796deac19fcf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -516,4 +516,12 @@ struct vm_special_mapping | |||
516 | struct page **pages; | 516 | struct page **pages; |
517 | }; | 517 | }; |
518 | 518 | ||
519 | enum tlb_flush_reason { | ||
520 | TLB_FLUSH_ON_TASK_SWITCH, | ||
521 | TLB_REMOTE_SHOOTDOWN, | ||
522 | TLB_LOCAL_SHOOTDOWN, | ||
523 | TLB_LOCAL_MM_SHOOTDOWN, | ||
524 | NR_TLB_FLUSH_REASONS, | ||
525 | }; | ||
526 | |||
519 | #endif /* _LINUX_MM_TYPES_H */ | 527 | #endif /* _LINUX_MM_TYPES_H */ |
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h new file mode 100644 index 000000000000..13391d288107 --- /dev/null +++ b/include/trace/events/tlb.h | |||
@@ -0,0 +1,40 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM tlb | ||
3 | |||
4 | #if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_TLB_H | ||
6 | |||
7 | #include <linux/mm_types.h> | ||
8 | #include <linux/tracepoint.h> | ||
9 | |||
10 | #define TLB_FLUSH_REASON \ | ||
11 | { TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" }, \ | ||
12 | { TLB_REMOTE_SHOOTDOWN, "remote shootdown" }, \ | ||
13 | { TLB_LOCAL_SHOOTDOWN, "local shootdown" }, \ | ||
14 | { TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" } | ||
15 | |||
16 | TRACE_EVENT(tlb_flush, | ||
17 | |||
18 | TP_PROTO(int reason, unsigned long pages), | ||
19 | TP_ARGS(reason, pages), | ||
20 | |||
21 | TP_STRUCT__entry( | ||
22 | __field( int, reason) | ||
23 | __field(unsigned long, pages) | ||
24 | ), | ||
25 | |||
26 | TP_fast_assign( | ||
27 | __entry->reason = reason; | ||
28 | __entry->pages = pages; | ||
29 | ), | ||
30 | |||
31 | TP_printk("pages:%ld reason:%s (%d)", | ||
32 | __entry->pages, | ||
33 | __print_symbolic(__entry->reason, TLB_FLUSH_REASON), | ||
34 | __entry->reason) | ||
35 | ); | ||
36 | |||
37 | #endif /* _TRACE_TLB_H */ | ||
38 | |||
39 | /* This part must be outside protection */ | ||
40 | #include <trace/define_trace.h> | ||