11 files changed, 193 insertions, 99 deletions
diff --git a/Documentation/x86/tlb.txt b/Documentation/x86/tlb.txt
new file mode 100644
index 000000000000..2b3a82e69151
--- /dev/null
+++ b/Documentation/x86/tlb.txt
@@ -0,0 +1,75 @@
+When the kernel unmaps or modifies the attributes of a range of
+memory, it has two choices:
+ 1. Flush the entire TLB with a two-instruction sequence.  This is
+    a quick operation, but it causes collateral damage: TLB entries
+    from areas other than the one we are trying to flush will be
+    destroyed and must be refilled later, at some cost.
+ 2. Use the invlpg instruction to invalidate a single page at a
+    time.  This could potentially cost many more instructions, but
+    it is a much more precise operation, causing no collateral
+    damage to other TLB entries.
+Which method to use depends on a few things:
+ 1. The size of the flush being performed.  A flush of the entire
+    address space is obviously better performed by flushing the
+    entire TLB than doing 2^48/PAGE_SIZE individual flushes.
+ 2. The contents of the TLB.  If the TLB is empty, then there will
+    be no collateral damage caused by doing the global flush, and
+    all of the individual flushes will have ended up being wasted
+    work.
+ 3. The size of the TLB.  The larger the TLB, the more collateral
+    damage we do with a full flush.  So, the larger the TLB, the
+    more attractive an individual flush looks.  Data and
+    instructions have separate TLBs, as do different page sizes.
+ 4. The microarchitecture.  The TLB has become a multi-level
+    cache on modern CPUs, and the global flushes have become more
+    expensive relative to single-page flushes.
+There is obviously no way the kernel can know all these things,
+especially the contents of the TLB during a given flush.  The
+sizes of the flush will vary greatly depending on the workload as
+well.  There is essentially no "right" point to choose.
+You may be doing too many individual invalidations if you see the
+invlpg instruction (or instructions _near_ it) show up high in
+profiles.  If you believe that individual invalidations are being
+called too often, you can lower the tunable:
+        /sys/kernel/debug/x86/tlb_single_page_flush_ceiling
+This will cause us to do the global flush for more cases.
+Lowering it to 0 will disable the use of the individual flushes.
+Setting it to 1 is a very conservative setting and it should
+never need to be 0 under normal circumstances.
+Despite the fact that a single individual flush on x86 is
+guaranteed to flush a full 2MB [1], hugetlbfs always uses the full
+flushes.  THP is treated exactly the same as normal memory.
+You might see invlpg inside of flush_tlb_mm_range() show up in
+profiles, or you can use the trace_tlb_flush() tracepoints to
+determine how long the flush operations are taking.
+Essentially, you are balancing the cycles you spend doing invlpg
+with the cycles that you spend refilling the TLB later.
+You can measure how expensive TLB refills are by using
+performance counters and 'perf stat', like this:
+perf stat -e
+        cpu/event=0x8,umask=0x84,name=dtlb_load_misses_walk_duration/,
+        cpu/event=0x8,umask=0x82,name=dtlb_load_misses_walk_completed/,
+        cpu/event=0x49,umask=0x4,name=dtlb_store_misses_walk_duration/,
+        cpu/event=0x49,umask=0x2,name=dtlb_store_misses_walk_completed/,
+        cpu/event=0x85,umask=0x4,name=itlb_misses_walk_duration/,
+        cpu/event=0x85,umask=0x2,name=itlb_misses_walk_completed/
+That works on an IvyBridge-era CPU (i5-3320M).  Different CPUs
+may have differently-named counters, but they should at least
+be there in some form.  You can use pmu-tools 'ocperf list'
+(https://github.com/andikleen/pmu-tools) to find the right
+counters for a given CPU.
+1. A footnote in Intel's SDM "4.10.4.2 Recommended Invalidation"
+   says: "One execution of INVLPG is sufficient even for a page
+   with size greater than 4 KBytes."
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index be12c534fd59..166af2a8e865 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -3,6 +3,10 @@
 #include <asm/desc.h>
 #include <linux/atomic.h>
+#include <linux/mm_types.h>
+#include <trace/events/tlb.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
@@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                /* Re-load page tables */
                load_cr3(next->pgd);
+                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
                /* Stop flush ipis for the previous mm */
                cpumask_clear_cpu(cpu, mm_cpumask(prev));
@@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                         * to make sure to use no freed page tables.
                         */
                        load_cr3(next->pgd);
+                        trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
                        load_LDT_nolock(&next->context);
                }
        }
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 32cc237f8e20..ee30b9f0b91c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -72,7 +72,6 @@ extern u16 __read_mostly tlb_lld_4k[NR_INFO];
 extern u16 __read_mostly tlb_lld_2m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_1g[NR_INFO];
-extern s8  __read_mostly tlb_flushall_shift;
 /*
 *  CPU type and hardware bug flags. Kept separately for each CPU.
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bc360d3df60e..60e5497681f5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -724,11 +724,6 @@ static unsigned int amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 }
 #endif
-static void cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
-{
-        tlb_flushall_shift = 6;
-}
 static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
 {
        u32 ebx, eax, ecx, edx;
@@ -776,8 +771,6 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
                tlb_lli_2m[ENTRIES] = eax & mask;
        tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
-        cpu_set_tlb_flushall_shift(c);
 }
 static const struct cpu_dev amd_cpu_dev = {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 188a8c5cc094..333fd5209336 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -481,26 +481,17 @@ u16 __read_mostly tlb_lld_2m[NR_INFO];
 u16 __read_mostly tlb_lld_4m[NR_INFO];
 u16 __read_mostly tlb_lld_1g[NR_INFO];
-/*
- * tlb_flushall_shift shows the balance point in replacing cr3 write
- * with multiple 'invlpg'. It will do this replacement when
- *   flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
- * If tlb_flushall_shift is -1, means the replacement will be disabled.
- */
-s8  __read_mostly tlb_flushall_shift = -1;
 void cpu_detect_tlb(struct cpuinfo_x86 *c)
 {
        if (this_cpu->c_detect_tlb)
                this_cpu->c_detect_tlb(c);
        printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
-                "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
+                "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n",
-                "tlb_flushall_shift: %d\n",
                tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
                tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
                tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
-                tlb_lld_1g[ENTRIES], tlb_flushall_shift);
+                tlb_lld_1g[ENTRIES]);
 }
 void detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 9483ee5b3991..74e804ddc5c7 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -634,31 +634,6 @@ static void intel_tlb_lookup(const unsigned char desc)
        }
 }
-static void intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
-{
-        switch ((c->x86 << 8) + c->x86_model) {
-        case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
-        case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
-        case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
-        case 0x61d: /* six-core 45 nm xeon "Dunnington" */
-                tlb_flushall_shift = -1;
-                break;
-        case 0x63a: /* Ivybridge */
-                tlb_flushall_shift = 2;
-                break;
-        case 0x61a: /* 45 nm nehalem, "Bloomfield" */
-        case 0x61e: /* 45 nm nehalem, "Lynnfield" */
-        case 0x625: /* 32 nm nehalem, "Clarkdale" */
-        case 0x62c: /* 32 nm nehalem, "Gulftown" */
-        case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
-        case 0x62f: /* 32 nm Xeon E7 */
-        case 0x62a: /* SandyBridge */
-        case 0x62d: /* SandyBridge, "Romely-EP" */
-        default:
-                tlb_flushall_shift = 6;
-        }
-}
 static void intel_detect_tlb(struct cpuinfo_x86 *c)
 {
        int i, j, n;
@@ -683,7 +658,6 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c)
                for (j = 1 ; j < 16 ; j++)
                        intel_tlb_lookup(desc[j]);
        }
-        intel_tlb_flushall_shift_set(c);
 }
 static const struct cpu_dev intel_cpu_dev = {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 36642793e315..1dbade870f90 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -577,6 +577,8 @@ static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 static const char nx_warning[] = KERN_CRIT
 "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
+static const char smep_warning[] = KERN_CRIT
+"unable to execute userspace code (SMEP?) (uid: %d)\n";
 static void
 show_fault_oops(struct pt_regs *regs, unsigned long error_code,
@@ -597,6 +599,10 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
                if (pte && pte_present(*pte) && !pte_exec(*pte))
                        printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
+                if (pte && pte_present(*pte) && pte_exec(*pte) &&
+                                (pgd_flags(*pgd) & _PAGE_USER) &&
+                                (read_cr4() & X86_CR4_SMEP))
+                        printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
        }
        printk(KERN_ALERT "BUG: unable to handle kernel ");
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index f97130618113..66dba36f2343 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -18,6 +18,13 @@
 #include <asm/dma.h>            /* for MAX_DMA_PFN */
 #include <asm/microcode.h>
+/*
+ * We need to define the tracepoints somewhere, and tlb.c
+ * is only compiled when SMP=y.
+ */
+#define CREATE_TRACE_POINTS
+#include <trace/events/tlb.h>
 #include "mm_internal.h"
 static unsigned long __initdata pgt_buf_start;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index dd8dda167a24..1fe33987de02 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -49,6 +49,7 @@ void leave_mm(int cpu)
        if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
                cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
                load_cr3(swapper_pg_dir);
+                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
        }
 }
 EXPORT_SYMBOL_GPL(leave_mm);
@@ -102,20 +103,24 @@ static void flush_tlb_func(void *info)
        if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                return;
+        if (!f->flush_end)
+                f->flush_end = f->flush_start + PAGE_SIZE;
        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
        if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-                if (f->flush_end == TLB_FLUSH_ALL)
+                if (f->flush_end == TLB_FLUSH_ALL) {
                        local_flush_tlb();
-                else if (!f->flush_end)
+                        trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
-                        __flush_tlb_single(f->flush_start);
+                } else {
-                else {
                        unsigned long addr;
+                        unsigned long nr_pages =
+                                (f->flush_end - f->flush_start) / PAGE_SIZE;
                        addr = f->flush_start;
                        while (addr < f->flush_end) {
                                __flush_tlb_single(addr);
                                addr += PAGE_SIZE;
                        }
+                        trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
                }
        } else
                leave_mm(smp_processor_id());
@@ -153,46 +158,45 @@ void flush_tlb_current_task(void)
        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
        local_flush_tlb();
+        trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
        if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
        preempt_enable();
 }
+/*
+ * See Documentation/x86/tlb.txt for details.  We choose 33
+ * because it is large enough to cover the vast majority (at
+ * least 95%) of allocations, and is small enough that we are
+ * confident it will not cause too much overhead.  Each single
+ * flush is about 100 ns, so this caps the maximum overhead at
+ * _about_ 3,000 ns.
+ *
+ * This is in units of pages.
+ */
+unsigned long tlb_single_page_flush_ceiling = 33;
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                unsigned long end, unsigned long vmflag)
 {
        unsigned long addr;
-        unsigned act_entries, tlb_entries = 0;
+        /* do a global flush by default */
-        unsigned long nr_base_pages;
+        unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
        preempt_disable();
        if (current->active_mm != mm)
-                goto flush_all;
+                goto out;
        if (!current->mm) {
                leave_mm(smp_processor_id());
-                goto flush_all;
+                goto out;
        }
-        if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1
+        if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
-                                        || vmflag & VM_HUGETLB) {
+                base_pages_to_flush = (end - start) >> PAGE_SHIFT;
-                local_flush_tlb();
-                goto flush_all;
-        }
-        /* In modern CPU, last level tlb used for both data/ins */
-        if (vmflag & VM_EXEC)
-                tlb_entries = tlb_lli_4k[ENTRIES];
-        else
-                tlb_entries = tlb_lld_4k[ENTRIES];
-        /* Assume all of TLB entries was occupied by this task */
+        if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
-        act_entries = tlb_entries >> tlb_flushall_shift;
+                base_pages_to_flush = TLB_FLUSH_ALL;
-        act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
-        nr_base_pages = (end - start) >> PAGE_SHIFT;
-        /* tlb_flushall_shift is on balance point, details in commit log */
-        if (nr_base_pages > act_entries) {
                count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                local_flush_tlb();
        } else {
@@ -201,17 +205,15 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
                        __flush_tlb_single(addr);
                }
-                if (cpumask_any_but(mm_cpumask(mm),
-                                smp_processor_id()) < nr_cpu_ids)
-                        flush_tlb_others(mm_cpumask(mm), mm, start, end);
-                preempt_enable();
-                return;
        }
+        trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
-flush_all:
+out:
+        if (base_pages_to_flush == TLB_FLUSH_ALL) {
+                start = 0UL;
+                end = TLB_FLUSH_ALL;
+        }
        if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-                flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
+                flush_tlb_others(mm_cpumask(mm), mm, start, end);
        preempt_enable();
 }
@@ -260,32 +262,26 @@ static void do_kernel_range_flush(void *info)
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-        unsigned act_entries;
-        struct flush_tlb_info info;
-        /* In modern CPU, last level tlb used for both data/ins */
-        act_entries = tlb_lld_4k[ENTRIES];
        /* Balance as user space task's flush, a bit conservative */
-        if (end == TLB_FLUSH_ALL || tlb_flushall_shift == -1 ||
+        if (end == TLB_FLUSH_ALL ||
-                (end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift)
+            (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
                on_each_cpu(do_flush_tlb_all, NULL, 1);
-        else {
+        } else {
+                struct flush_tlb_info info;
                info.flush_start = start;
                info.flush_end = end;
                on_each_cpu(do_kernel_range_flush, &info, 1);
        }
 }
-#ifdef CONFIG_DEBUG_TLBFLUSH
 static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
                             size_t count, loff_t *ppos)
 {
        char buf[32];
        unsigned int len;
-        len = sprintf(buf, "%hd\n", tlb_flushall_shift);
+        len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling);
        return simple_read_from_buffer(user_buf, count, ppos, buf, len);
 }
@@ -294,20 +290,20 @@ static ssize_t tlbflush_write_file(struct file *file,
 {
        char buf[32];
        ssize_t len;
-        s8 shift;
+        int ceiling;
        len = min(count, sizeof(buf) - 1);
        if (copy_from_user(buf, user_buf, len))
                return -EFAULT;
        buf[len] = '\0';
-        if (kstrtos8(buf, 0, &shift))
+        if (kstrtoint(buf, 0, &ceiling))
                return -EINVAL;
-        if (shift < -1 || shift >= BITS_PER_LONG)
+        if (ceiling < 0)
                return -EINVAL;
-        tlb_flushall_shift = shift;
+        tlb_single_page_flush_ceiling = ceiling;
        return count;
 }
@@ -317,11 +313,10 @@ static const struct file_operations fops_tlbflush = {
        .llseek = default_llseek,
 };
-static int __init create_tlb_flushall_shift(void)
+static int __init create_tlb_single_page_flush_ceiling(void)
 {
-        debugfs_create_file("tlb_flushall_shift", S_IRUSR | S_IWUSR,
+        debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR,
                            arch_debugfs_dir, NULL, &fops_tlbflush);
        return 0;
 }
-late_initcall(create_tlb_flushall_shift);
+late_initcall(create_tlb_single_page_flush_ceiling);
-#endif
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 96c5750e3110..796deac19fcf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -516,4 +516,12 @@ struct vm_special_mapping
        struct page **pages;
 };
+enum tlb_flush_reason {
+        TLB_FLUSH_ON_TASK_SWITCH,
+        TLB_REMOTE_SHOOTDOWN,
+        TLB_LOCAL_SHOOTDOWN,
+        TLB_LOCAL_MM_SHOOTDOWN,
+        NR_TLB_FLUSH_REASONS,
+};
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h
new file mode 100644
index 000000000000..13391d288107
--- /dev/null
+++ b/include/trace/events/tlb.h
@@ -0,0 +1,40 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tlb
+#if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TLB_H
+#include <linux/mm_types.h>
+#include <linux/tracepoint.h>
+#define TLB_FLUSH_REASON        \
+        { TLB_FLUSH_ON_TASK_SWITCH,     "flush on task switch" },       \
+        { TLB_REMOTE_SHOOTDOWN,         "remote shootdown" },           \
+        { TLB_LOCAL_SHOOTDOWN,          "local shootdown" },            \
+        { TLB_LOCAL_MM_SHOOTDOWN,       "local mm shootdown" }
+TRACE_EVENT(tlb_flush,
+        TP_PROTO(int reason, unsigned long pages),
+        TP_ARGS(reason, pages),
+        TP_STRUCT__entry(
+                __field(          int, reason)
+                __field(unsigned long,  pages)
+        ),
+        TP_fast_assign(
+                __entry->reason = reason;
+                __entry->pages  = pages;
+        ),
+        TP_printk("pages:%ld reason:%s (%d)",
+                __entry->pages,
+                __print_symbolic(__entry->reason, TLB_FLUSH_REASON),
+                __entry->reason)
+);
+#endif /* _TRACE_TLB_H */
+/* This part must be outside protection */
+#include <trace/define_trace.h>