diff options
| -rw-r--r-- | Documentation/x86/tlb.txt | 75 | ||||
| -rw-r--r-- | arch/x86/include/asm/mmu_context.h | 6 | ||||
| -rw-r--r-- | arch/x86/include/asm/processor.h | 1 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/amd.c | 7 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/common.c | 13 | ||||
| -rw-r--r-- | arch/x86/kernel/cpu/intel.c | 26 | ||||
| -rw-r--r-- | arch/x86/mm/fault.c | 6 | ||||
| -rw-r--r-- | arch/x86/mm/init.c | 7 | ||||
| -rw-r--r-- | arch/x86/mm/tlb.c | 103 | ||||
| -rw-r--r-- | include/linux/mm_types.h | 8 | ||||
| -rw-r--r-- | include/trace/events/tlb.h | 40 |
11 files changed, 193 insertions, 99 deletions
diff --git a/Documentation/x86/tlb.txt b/Documentation/x86/tlb.txt new file mode 100644 index 000000000000..2b3a82e69151 --- /dev/null +++ b/Documentation/x86/tlb.txt | |||
| @@ -0,0 +1,75 @@ | |||
| 1 | When the kernel unmaps or modifies the attributes of a range of | ||
| 2 | memory, it has two choices: | ||
| 3 | 1. Flush the entire TLB with a two-instruction sequence. This is | ||
| 4 | a quick operation, but it causes collateral damage: TLB entries | ||
| 5 | from areas other than the one we are trying to flush will be | ||
| 6 | destroyed and must be refilled later, at some cost. | ||
| 7 | 2. Use the invlpg instruction to invalidate a single page at a | ||
| 8 | time. This could potentially cost many more instructions, but | ||
| 9 | it is a much more precise operation, causing no collateral | ||
| 10 | damage to other TLB entries. | ||
| 11 | |||
| 12 | Which method to use depends on a few things: | ||
| 13 | 1. The size of the flush being performed. A flush of the entire | ||
| 14 | address space is obviously better performed by flushing the | ||
| 15 | entire TLB than doing 2^48/PAGE_SIZE individual flushes. | ||
| 16 | 2. The contents of the TLB. If the TLB is empty, then there will | ||
| 17 | be no collateral damage caused by doing the global flush, and | ||
| 18 | all of the individual flushes will have ended up being wasted | ||
| 19 | work. | ||
| 20 | 3. The size of the TLB. The larger the TLB, the more collateral | ||
| 21 | damage we do with a full flush. So, the larger the TLB, the | ||
| 22 | more attractive an individual flush looks. Data and | ||
| 23 | instructions have separate TLBs, as do different page sizes. | ||
| 24 | 4. The microarchitecture. The TLB has become a multi-level | ||
| 25 | cache on modern CPUs, and the global flushes have become more | ||
| 26 | expensive relative to single-page flushes. | ||
| 27 | |||
| 28 | There is obviously no way the kernel can know all these things, | ||
| 29 | especially the contents of the TLB during a given flush. The | ||
| 30 | sizes of the flush will vary greatly depending on the workload as | ||
| 31 | well. There is essentially no "right" point to choose. | ||
| 32 | |||
| 33 | You may be doing too many individual invalidations if you see the | ||
| 34 | invlpg instruction (or instructions _near_ it) show up high in | ||
| 35 | profiles. If you believe that individual invalidations are being | ||
| 36 | called too often, you can lower the tunable: | ||
| 37 | |||
| 38 | /sys/kernel/debug/x86/tlb_single_page_flush_ceiling | ||
| 39 | |||
| 40 | This will cause us to do the global flush for more cases. | ||
| 41 | Lowering it to 0 will disable the use of the individual flushes. | ||
| 42 | Setting it to 1 is a very conservative setting and it should | ||
| 43 | never need to be 0 under normal circumstances. | ||
| 44 | |||
| 45 | Despite the fact that a single individual flush on x86 is | ||
| 46 | guaranteed to flush a full 2MB [1], hugetlbfs always uses the full | ||
| 47 | flushes. THP is treated exactly the same as normal memory. | ||
| 48 | |||
| 49 | You might see invlpg inside of flush_tlb_mm_range() show up in | ||
| 50 | profiles, or you can use the trace_tlb_flush() tracepoints to | ||
| 51 | determine how long the flush operations are taking. | ||
| 52 | |||
| 53 | Essentially, you are balancing the cycles you spend doing invlpg | ||
| 54 | with the cycles that you spend refilling the TLB later. | ||
| 55 | |||
| 56 | You can measure how expensive TLB refills are by using | ||
| 57 | performance counters and 'perf stat', like this: | ||
| 58 | |||
| 59 | perf stat -e | ||
| 60 | cpu/event=0x8,umask=0x84,name=dtlb_load_misses_walk_duration/, | ||
| 61 | cpu/event=0x8,umask=0x82,name=dtlb_load_misses_walk_completed/, | ||
| 62 | cpu/event=0x49,umask=0x4,name=dtlb_store_misses_walk_duration/, | ||
| 63 | cpu/event=0x49,umask=0x2,name=dtlb_store_misses_walk_completed/, | ||
| 64 | cpu/event=0x85,umask=0x4,name=itlb_misses_walk_duration/, | ||
| 65 | cpu/event=0x85,umask=0x2,name=itlb_misses_walk_completed/ | ||
| 66 | |||
| 67 | That works on an IvyBridge-era CPU (i5-3320M). Different CPUs | ||
| 68 | may have differently-named counters, but they should at least | ||
| 69 | be there in some form. You can use pmu-tools 'ocperf list' | ||
| 70 | (https://github.com/andikleen/pmu-tools) to find the right | ||
| 71 | counters for a given CPU. | ||
| 72 | |||
| 73 | [1] A footnote in Intel's SDM "4.10.4.2 Recommended Invalidation" | ||
| 74 | says: "One execution of INVLPG is sufficient even for a page | ||
| 75 | with size greater than 4 KBytes." | ||
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index be12c534fd59..166af2a8e865 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h | |||
| @@ -3,6 +3,10 @@ | |||
| 3 | 3 | ||
| 4 | #include <asm/desc.h> | 4 | #include <asm/desc.h> |
| 5 | #include <linux/atomic.h> | 5 | #include <linux/atomic.h> |
| 6 | #include <linux/mm_types.h> | ||
| 7 | |||
| 8 | #include <trace/events/tlb.h> | ||
| 9 | |||
| 6 | #include <asm/pgalloc.h> | 10 | #include <asm/pgalloc.h> |
| 7 | #include <asm/tlbflush.h> | 11 | #include <asm/tlbflush.h> |
| 8 | #include <asm/paravirt.h> | 12 | #include <asm/paravirt.h> |
| @@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
| 44 | 48 | ||
| 45 | /* Re-load page tables */ | 49 | /* Re-load page tables */ |
| 46 | load_cr3(next->pgd); | 50 | load_cr3(next->pgd); |
| 51 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
| 47 | 52 | ||
| 48 | /* Stop flush ipis for the previous mm */ | 53 | /* Stop flush ipis for the previous mm */ |
| 49 | cpumask_clear_cpu(cpu, mm_cpumask(prev)); | 54 | cpumask_clear_cpu(cpu, mm_cpumask(prev)); |
| @@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, | |||
| 71 | * to make sure to use no freed page tables. | 76 | * to make sure to use no freed page tables. |
| 72 | */ | 77 | */ |
| 73 | load_cr3(next->pgd); | 78 | load_cr3(next->pgd); |
| 79 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
| 74 | load_LDT_nolock(&next->context); | 80 | load_LDT_nolock(&next->context); |
| 75 | } | 81 | } |
| 76 | } | 82 | } |
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 32cc237f8e20..ee30b9f0b91c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h | |||
| @@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO]; | |||
| 72 | extern u16 __read_mostly tlb_lld_2m[NR_INFO]; | 72 | extern u16 __read_mostly tlb_lld_2m[NR_INFO]; |
| 73 | extern u16 __read_mostly tlb_lld_4m[NR_INFO]; | 73 | extern u16 __read_mostly tlb_lld_4m[NR_INFO]; |
| 74 | extern u16 __read_mostly tlb_lld_1g[NR_INFO]; | 74 | extern u16 __read_mostly tlb_lld_1g[NR_INFO]; |
| 75 | extern s8 __read_mostly tlb_flushall_shift; | ||
| 76 | 75 | ||
| 77 | /* | 76 | /* |
| 78 | * CPU type and hardware bug flags. Kept separately for each CPU. | 77 | * CPU type and hardware bug flags. Kept separately for each CPU. |
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bc360d3df60e..60e5497681f5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
| @@ -724,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) | |||
| 724 | } | 724 | } |
| 725 | #endif | 725 | #endif |
| 726 | 726 | ||
| 727 | static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) | ||
| 728 | { | ||
| 729 | tlb_flushall_shift = 6; | ||
| 730 | } | ||
| 731 | |||
| 732 | static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) | 727 | static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) |
| 733 | { | 728 | { |
| 734 | u32 ebx, eax, ecx, edx; | 729 | u32 ebx, eax, ecx, edx; |
| @@ -776,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) | |||
| 776 | tlb_lli_2m[ENTRIES] = eax & mask; | 771 | tlb_lli_2m[ENTRIES] = eax & mask; |
| 777 | 772 | ||
| 778 | tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; | 773 | tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; |
| 779 | |||
| 780 | cpu_set_tlb_flushall_shift(c); | ||
| 781 | } | 774 | } |
| 782 | 775 | ||
| 783 | static const struct cpu_dev amd_cpu_dev = { | 776 | static const struct cpu_dev amd_cpu_dev = { |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 188a8c5cc094..333fd5209336 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
| @@ -481,26 +481,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO]; | |||
| 481 | u16 __read_mostly tlb_lld_4m[NR_INFO]; | 481 | u16 __read_mostly tlb_lld_4m[NR_INFO]; |
| 482 | u16 __read_mostly tlb_lld_1g[NR_INFO]; | 482 | u16 __read_mostly tlb_lld_1g[NR_INFO]; |
| 483 | 483 | ||
| 484 | /* | ||
| 485 | * tlb_flushall_shift shows the balance point in replacing cr3 write | ||
| 486 | * with multiple 'invlpg'. It will do this replacement when | ||
| 487 | * flush_tlb_lines <= active_lines/2^tlb_flushall_shift. | ||
| 488 | * If tlb_flushall_shift is -1, means the replacement will be disabled. | ||
| 489 | */ | ||
| 490 | s8 __read_mostly tlb_flushall_shift = -1; | ||
| 491 | |||
| 492 | void cpu_detect_tlb(struct cpuinfo_x86 *c) | 484 | void cpu_detect_tlb(struct cpuinfo_x86 *c) |
| 493 | { | 485 | { |
| 494 | if (this_cpu->c_detect_tlb) | 486 | if (this_cpu->c_detect_tlb) |
| 495 | this_cpu->c_detect_tlb(c); | 487 | this_cpu->c_detect_tlb(c); |
| 496 | 488 | ||
| 497 | printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" | 489 | printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" |
| 498 | "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" | 490 | "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n", |
| 499 | "tlb_flushall_shift: %d\n", | ||
| 500 | tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], | 491 | tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], |
| 501 | tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], | 492 | tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], |
| 502 | tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], | 493 | tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], |
| 503 | tlb_lld_1g[ENTRIES], tlb_flushall_shift); | 494 | tlb_lld_1g[ENTRIES]); |
| 504 | } | 495 | } |
| 505 | 496 | ||
| 506 | void detect_ht(struct cpuinfo_x86 *c) | 497 | void detect_ht(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 9483ee5b3991..74e804ddc5c7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
| @@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc) | |||
| 634 | } | 634 | } |
| 635 | } | 635 | } |
| 636 | 636 | ||
| 637 | static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c) | ||
| 638 | { | ||
| 639 | switch ((c->x86 << 8) + c->x86_model) { | ||
| 640 | case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ | ||
| 641 | case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ | ||
| 642 | case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ | ||
| 643 | case 0x61d: /* six-core 45 nm xeon "Dunnington" */ | ||
| 644 | tlb_flushall_shift = -1; | ||
| 645 | break; | ||
| 646 | case 0x63a: /* Ivybridge */ | ||
| 647 | tlb_flushall_shift = 2; | ||
| 648 | break; | ||
| 649 | case 0x61a: /* 45 nm nehalem, "Bloomfield" */ | ||
| 650 | case 0x61e: /* 45 nm nehalem, "Lynnfield" */ | ||
| 651 | case 0x625: /* 32 nm nehalem, "Clarkdale" */ | ||
| 652 | case 0x62c: /* 32 nm nehalem, "Gulftown" */ | ||
| 653 | case 0x62e: /* 45 nm nehalem-ex, "Beckton" */ | ||
| 654 | case 0x62f: /* 32 nm Xeon E7 */ | ||
| 655 | case 0x62a: /* SandyBridge */ | ||
| 656 | case 0x62d: /* SandyBridge, "Romely-EP" */ | ||
| 657 | default: | ||
| 658 | tlb_flushall_shift = 6; | ||
| 659 | } | ||
| 660 | } | ||
| 661 | |||
| 662 | static void intel_detect_tlb(struct cpuinfo_x86 *c) | 637 | static void intel_detect_tlb(struct cpuinfo_x86 *c) |
| 663 | { | 638 | { |
| 664 | int i, j, n; | 639 | int i, j, n; |
| @@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) | |||
| 683 | for (j = 1 ; j < 16 ; j++) | 658 | for (j = 1 ; j < 16 ; j++) |
| 684 | intel_tlb_lookup(desc[j]); | 659 | intel_tlb_lookup(desc[j]); |
| 685 | } | 660 | } |
| 686 | intel_tlb_flushall_shift_set(c); | ||
| 687 | } | 661 | } |
| 688 | 662 | ||
| 689 | static const struct cpu_dev intel_cpu_dev = { | 663 | static const struct cpu_dev intel_cpu_dev = { |
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 36642793e315..1dbade870f90 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
| @@ -577,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address) | |||
| 577 | 577 | ||
| 578 | static const char nx_warning[] = KERN_CRIT | 578 | static const char nx_warning[] = KERN_CRIT |
| 579 | "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; | 579 | "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n"; |
| 580 | static const char smep_warning[] = KERN_CRIT | ||
| 581 | "unable to execute userspace code (SMEP?) (uid: %d)\n"; | ||
| 580 | 582 | ||
| 581 | static void | 583 | static void |
| 582 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, | 584 | show_fault_oops(struct pt_regs *regs, unsigned long error_code, |
| @@ -597,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, | |||
| 597 | 599 | ||
| 598 | if (pte && pte_present(*pte) && !pte_exec(*pte)) | 600 | if (pte && pte_present(*pte) && !pte_exec(*pte)) |
| 599 | printk(nx_warning, from_kuid(&init_user_ns, current_uid())); | 601 | printk(nx_warning, from_kuid(&init_user_ns, current_uid())); |
| 602 | if (pte && pte_present(*pte) && pte_exec(*pte) && | ||
| 603 | (pgd_flags(*pgd) & _PAGE_USER) && | ||
| 604 | (read_cr4() & X86_CR4_SMEP)) | ||
| 605 | printk(smep_warning, from_kuid(&init_user_ns, current_uid())); | ||
| 600 | } | 606 | } |
| 601 | 607 | ||
| 602 | printk(KERN_ALERT "BUG: unable to handle kernel "); | 608 | printk(KERN_ALERT "BUG: unable to handle kernel "); |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f97130618113..66dba36f2343 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c | |||
| @@ -18,6 +18,13 @@ | |||
| 18 | #include <asm/dma.h> /* for MAX_DMA_PFN */ | 18 | #include <asm/dma.h> /* for MAX_DMA_PFN */ |
| 19 | #include <asm/microcode.h> | 19 | #include <asm/microcode.h> |
| 20 | 20 | ||
| 21 | /* | ||
| 22 | * We need to define the tracepoints somewhere, and tlb.c | ||
| 23 | * is only compiled when SMP=y. | ||
| 24 | */ | ||
| 25 | #define CREATE_TRACE_POINTS | ||
| 26 | #include <trace/events/tlb.h> | ||
| 27 | |||
| 21 | #include "mm_internal.h" | 28 | #include "mm_internal.h" |
| 22 | 29 | ||
| 23 | static unsigned long __initdata pgt_buf_start; | 30 | static unsigned long __initdata pgt_buf_start; |
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index dd8dda167a24..1fe33987de02 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
| @@ -49,6 +49,7 @@ void leave_mm(int cpu) | |||
| 49 | if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { | 49 | if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { |
| 50 | cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); | 50 | cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); |
| 51 | load_cr3(swapper_pg_dir); | 51 | load_cr3(swapper_pg_dir); |
| 52 | trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); | ||
| 52 | } | 53 | } |
| 53 | } | 54 | } |
| 54 | EXPORT_SYMBOL_GPL(leave_mm); | 55 | EXPORT_SYMBOL_GPL(leave_mm); |
| @@ -102,20 +103,24 @@ static void flush_tlb_func(void *info) | |||
| 102 | 103 | ||
| 103 | if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) | 104 | if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) |
| 104 | return; | 105 | return; |
| 106 | if (!f->flush_end) | ||
| 107 | f->flush_end = f->flush_start + PAGE_SIZE; | ||
| 105 | 108 | ||
| 106 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); | 109 | count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); |
| 107 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { | 110 | if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { |
| 108 | if (f->flush_end == TLB_FLUSH_ALL) | 111 | if (f->flush_end == TLB_FLUSH_ALL) { |
| 109 | local_flush_tlb(); | 112 | local_flush_tlb(); |
| 110 | else if (!f->flush_end) | 113 | trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); |
| 111 | __flush_tlb_single(f->flush_start); | 114 | } else { |
| 112 | else { | ||
| 113 | unsigned long addr; | 115 | unsigned long addr; |
| 116 | unsigned long nr_pages = | ||
| 117 | f->flush_end - f->flush_start / PAGE_SIZE; | ||
| 114 | addr = f->flush_start; | 118 | addr = f->flush_start; |
| 115 | while (addr < f->flush_end) { | 119 | while (addr < f->flush_end) { |
| 116 | __flush_tlb_single(addr); | 120 | __flush_tlb_single(addr); |
| 117 | addr += PAGE_SIZE; | 121 | addr += PAGE_SIZE; |
| 118 | } | 122 | } |
| 123 | trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); | ||
| 119 | } | 124 | } |
| 120 | } else | 125 | } else |
| 121 | leave_mm(smp_processor_id()); | 126 | leave_mm(smp_processor_id()); |
| @@ -153,46 +158,45 @@ void flush_tlb_current_task(void) | |||
| 153 | 158 | ||
| 154 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | 159 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
| 155 | local_flush_tlb(); | 160 | local_flush_tlb(); |
| 161 | trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); | ||
| 156 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | 162 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
| 157 | flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); | 163 | flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); |
| 158 | preempt_enable(); | 164 | preempt_enable(); |
| 159 | } | 165 | } |
| 160 | 166 | ||
| 167 | /* | ||
| 168 | * See Documentation/x86/tlb.txt for details. We choose 33 | ||
| 169 | * because it is large enough to cover the vast majority (at | ||
| 170 | * least 95%) of allocations, and is small enough that we are | ||
| 171 | * confident it will not cause too much overhead. Each single | ||
| 172 | * flush is about 100 ns, so this caps the maximum overhead at | ||
| 173 | * _about_ 3,000 ns. | ||
| 174 | * | ||
| 175 | * This is in units of pages. | ||
| 176 | */ | ||
| 177 | unsigned long tlb_single_page_flush_ceiling = 33; | ||
| 178 | |||
| 161 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | 179 | void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, |
| 162 | unsigned long end, unsigned long vmflag) | 180 | unsigned long end, unsigned long vmflag) |
| 163 | { | 181 | { |
| 164 | unsigned long addr; | 182 | unsigned long addr; |
| 165 | unsigned act_entries, tlb_entries = 0; | 183 | /* do a global flush by default */ |
| 166 | unsigned long nr_base_pages; | 184 | unsigned long base_pages_to_flush = TLB_FLUSH_ALL; |
| 167 | 185 | ||
| 168 | preempt_disable(); | 186 | preempt_disable(); |
| 169 | if (current->active_mm != mm) | 187 | if (current->active_mm != mm) |
| 170 | goto flush_all; | 188 | goto out; |
| 171 | 189 | ||
| 172 | if (!current->mm) { | 190 | if (!current->mm) { |
| 173 | leave_mm(smp_processor_id()); | 191 | leave_mm(smp_processor_id()); |
| 174 | goto flush_all; | 192 | goto out; |
| 175 | } | 193 | } |
| 176 | 194 | ||
| 177 | if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 | 195 | if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) |
| 178 | || vmflag & VM_HUGETLB) { | 196 | base_pages_to_flush = (end - start) >> PAGE_SHIFT; |
| 179 | local_flush_tlb(); | ||
| 180 | goto flush_all; | ||
| 181 | } | ||
| 182 | |||
| 183 | /* In modern CPU, last level tlb used for both data/ins */ | ||
| 184 | if (vmflag & VM_EXEC) | ||
| 185 | tlb_entries = tlb_lli_4k[ENTRIES]; | ||
| 186 | else | ||
| 187 | tlb_entries = tlb_lld_4k[ENTRIES]; | ||
| 188 | 197 | ||
| 189 | /* Assume all of TLB entries was occupied by this task */ | 198 | if (base_pages_to_flush > tlb_single_page_flush_ceiling) { |
| 190 | act_entries = tlb_entries >> tlb_flushall_shift; | 199 | base_pages_to_flush = TLB_FLUSH_ALL; |
| 191 | act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm; | ||
| 192 | nr_base_pages = (end - start) >> PAGE_SHIFT; | ||
| 193 | |||
| 194 | /* tlb_flushall_shift is on balance point, details in commit log */ | ||
| 195 | if (nr_base_pages > act_entries) { | ||
| 196 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); | 200 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); |
| 197 | local_flush_tlb(); | 201 | local_flush_tlb(); |
| 198 | } else { | 202 | } else { |
| @@ -201,17 +205,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, | |||
| 201 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); | 205 | count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); |
| 202 | __flush_tlb_single(addr); | 206 | __flush_tlb_single(addr); |
| 203 | } | 207 | } |
| 204 | |||
| 205 | if (cpumask_any_but(mm_cpumask(mm), | ||
| 206 | smp_processor_id()) < nr_cpu_ids) | ||
| 207 | flush_tlb_others(mm_cpumask(mm), mm, start, end); | ||
| 208 | preempt_enable(); | ||
| 209 | return; | ||
| 210 | } | 208 | } |
| 211 | 209 | trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); | |
| 212 | flush_all: | 210 | out: |
| 211 | if (base_pages_to_flush == TLB_FLUSH_ALL) { | ||
| 212 | start = 0UL; | ||
| 213 | end = TLB_FLUSH_ALL; | ||
| 214 | } | ||
| 213 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) | 215 | if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) |
| 214 | flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); | 216 | flush_tlb_others(mm_cpumask(mm), mm, start, end); |
| 215 | preempt_enable(); | 217 | preempt_enable(); |
| 216 | } | 218 | } |
| 217 | 219 | ||
| @@ -260,32 +262,26 @@ static void do_kernel_range_flush(void *info) | |||
| 260 | 262 | ||
| 261 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) | 263 | void flush_tlb_kernel_range(unsigned long start, unsigned long end) |
| 262 | { | 264 | { |
| 263 | unsigned act_entries; | ||
| 264 | struct flush_tlb_info info; | ||
| 265 | |||
| 266 | /* In modern CPU, last level tlb used for both data/ins */ | ||
| 267 | act_entries = tlb_lld_4k[ENTRIES]; | ||
| 268 | 265 | ||
| 269 | /* Balance as user space task's flush, a bit conservative */ | 266 | /* Balance as user space task's flush, a bit conservative */ |
| 270 | if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 || | 267 | if (end == TLB_FLUSH_ALL || |
| 271 | (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) | 268 | (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { |
| 272 | |||
| 273 | on_each_cpu(do_flush_tlb_all, NULL, 1); | 269 | on_each_cpu(do_flush_tlb_all, NULL, 1); |
| 274 | else { | 270 | } else { |
| 271 | struct flush_tlb_info info; | ||
| 275 | info.flush_start = start; | 272 | info.flush_start = start; |
| 276 | info.flush_end = end; | 273 | info.flush_end = end; |
| 277 | on_each_cpu(do_kernel_range_flush, &info, 1); | 274 | on_each_cpu(do_kernel_range_flush, &info, 1); |
| 278 | } | 275 | } |
| 279 | } | 276 | } |
| 280 | 277 | ||
| 281 | #ifdef CONFIG_DEBUG_TLBFLUSH | ||
| 282 | static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, | 278 | static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, |
| 283 | size_t count, loff_t *ppos) | 279 | size_t count, loff_t *ppos) |
| 284 | { | 280 | { |
| 285 | char buf[32]; | 281 | char buf[32]; |
| 286 | unsigned int len; | 282 | unsigned int len; |
| 287 | 283 | ||
| 288 | len = sprintf(buf, "%hd\n", tlb_flushall_shift); | 284 | len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling); |
| 289 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); | 285 | return simple_read_from_buffer(user_buf, count, ppos, buf, len); |
| 290 | } | 286 | } |
| 291 | 287 | ||
| @@ -294,20 +290,20 @@ static ssize_t tlbflush_write_file(struct file *file, | |||
| 294 | { | 290 | { |
| 295 | char buf[32]; | 291 | char buf[32]; |
| 296 | ssize_t len; | 292 | ssize_t len; |
| 297 | s8 shift; | 293 | int ceiling; |
| 298 | 294 | ||
| 299 | len = min(count, sizeof(buf) - 1); | 295 | len = min(count, sizeof(buf) - 1); |
| 300 | if (copy_from_user(buf, user_buf, len)) | 296 | if (copy_from_user(buf, user_buf, len)) |
| 301 | return -EFAULT; | 297 | return -EFAULT; |
| 302 | 298 | ||
| 303 | buf[len] = '\0'; | 299 | buf[len] = '\0'; |
| 304 | if (kstrtos8(buf, 0, &shift)) | 300 | if (kstrtoint(buf, 0, &ceiling)) |
| 305 | return -EINVAL; | 301 | return -EINVAL; |
| 306 | 302 | ||
| 307 | if (shift < -1 || shift >= BITS_PER_LONG) | 303 | if (ceiling < 0) |
| 308 | return -EINVAL; | 304 | return -EINVAL; |
| 309 | 305 | ||
| 310 | tlb_flushall_shift = shift; | 306 | tlb_single_page_flush_ceiling = ceiling; |
| 311 | return count; | 307 | return count; |
| 312 | } | 308 | } |
| 313 | 309 | ||
| @@ -317,11 +313,10 @@ static const struct file_operations fops_tlbflush = { | |||
| 317 | .llseek = default_llseek, | 313 | .llseek = default_llseek, |
| 318 | }; | 314 | }; |
| 319 | 315 | ||
| 320 | static int __init create_tlb_flushall_shift(void) | 316 | static int __init create_tlb_single_page_flush_ceiling(void) |
| 321 | { | 317 | { |
| 322 | debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR, | 318 | debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR, |
| 323 | arch_debugfs_dir, NULL, &fops_tlbflush); | 319 | arch_debugfs_dir, NULL, &fops_tlbflush); |
| 324 | return 0; | 320 | return 0; |
| 325 | } | 321 | } |
| 326 | late_initcall(create_tlb_flushall_shift); | 322 | late_initcall(create_tlb_single_page_flush_ceiling); |
| 327 | #endif | ||
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 96c5750e3110..796deac19fcf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
| @@ -516,4 +516,12 @@ struct vm_special_mapping | |||
| 516 | struct page **pages; | 516 | struct page **pages; |
| 517 | }; | 517 | }; |
| 518 | 518 | ||
| 519 | enum tlb_flush_reason { | ||
| 520 | TLB_FLUSH_ON_TASK_SWITCH, | ||
| 521 | TLB_REMOTE_SHOOTDOWN, | ||
| 522 | TLB_LOCAL_SHOOTDOWN, | ||
| 523 | TLB_LOCAL_MM_SHOOTDOWN, | ||
| 524 | NR_TLB_FLUSH_REASONS, | ||
| 525 | }; | ||
| 526 | |||
| 519 | #endif /* _LINUX_MM_TYPES_H */ | 527 | #endif /* _LINUX_MM_TYPES_H */ |
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h new file mode 100644 index 000000000000..13391d288107 --- /dev/null +++ b/include/trace/events/tlb.h | |||
| @@ -0,0 +1,40 @@ | |||
| 1 | #undef TRACE_SYSTEM | ||
| 2 | #define TRACE_SYSTEM tlb | ||
| 3 | |||
| 4 | #if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ) | ||
| 5 | #define _TRACE_TLB_H | ||
| 6 | |||
| 7 | #include <linux/mm_types.h> | ||
| 8 | #include <linux/tracepoint.h> | ||
| 9 | |||
| 10 | #define TLB_FLUSH_REASON \ | ||
| 11 | { TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" }, \ | ||
| 12 | { TLB_REMOTE_SHOOTDOWN, "remote shootdown" }, \ | ||
| 13 | { TLB_LOCAL_SHOOTDOWN, "local shootdown" }, \ | ||
| 14 | { TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" } | ||
| 15 | |||
| 16 | TRACE_EVENT(tlb_flush, | ||
| 17 | |||
| 18 | TP_PROTO(int reason, unsigned long pages), | ||
| 19 | TP_ARGS(reason, pages), | ||
| 20 | |||
| 21 | TP_STRUCT__entry( | ||
| 22 | __field( int, reason) | ||
| 23 | __field(unsigned long, pages) | ||
| 24 | ), | ||
| 25 | |||
| 26 | TP_fast_assign( | ||
| 27 | __entry->reason = reason; | ||
| 28 | __entry->pages = pages; | ||
| 29 | ), | ||
| 30 | |||
| 31 | TP_printk("pages:%ld reason:%s (%d)", | ||
| 32 | __entry->pages, | ||
| 33 | __print_symbolic(__entry->reason, TLB_FLUSH_REASON), | ||
| 34 | __entry->reason) | ||
| 35 | ); | ||
| 36 | |||
| 37 | #endif /* _TRACE_TLB_H */ | ||
| 38 | |||
| 39 | /* This part must be outside protection */ | ||
| 40 | #include <trace/define_trace.h> | ||
