Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/fault.c    | 61
-rw-r--r--  arch/x86/mm/numa.c     | 21
-rw-r--r--  arch/x86/mm/numa_32.c  |  2
-rw-r--r--  arch/x86/mm/srat.c     | 16
-rw-r--r--  arch/x86/mm/tlb.c      | 52
5 files changed, 82 insertions, 70 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9d591c895803..a10c8c792161 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1001,6 +1001,12 @@ static int fault_in_kernel_space(unsigned long address)
 
 static inline bool smap_violation(int error_code, struct pt_regs *regs)
 {
+        if (!IS_ENABLED(CONFIG_X86_SMAP))
+                return false;
+
+        if (!static_cpu_has(X86_FEATURE_SMAP))
+                return false;
+
         if (error_code & PF_USER)
                 return false;
 
@@ -1014,13 +1020,17 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
  * This routine handles page faults. It determines the address,
  * and the problem, and then passes it off to one of the appropriate
  * routines.
+ *
+ * This function must have noinline because both callers
+ * {,trace_}do_page_fault() have notrace on. Having this an actual function
+ * guarantees there's a function trace entry.
  */
-static void __kprobes
-__do_page_fault(struct pt_regs *regs, unsigned long error_code)
+static void __kprobes noinline
+__do_page_fault(struct pt_regs *regs, unsigned long error_code,
+                unsigned long address)
 {
         struct vm_area_struct *vma;
         struct task_struct *tsk;
-        unsigned long address;
         struct mm_struct *mm;
         int fault;
         unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
@@ -1028,9 +1038,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         tsk = current;
         mm = tsk->mm;
 
-        /* Get the faulting address: */
-        address = read_cr2();
-
         /*
          * Detect and handle instructions that would cause a page fault for
          * both a tracked kernel page and a userspace page.
@@ -1087,11 +1094,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
         if (unlikely(error_code & PF_RSVD))
                 pgtable_bad(regs, error_code, address);
 
-        if (static_cpu_has(X86_FEATURE_SMAP)) {
-                if (unlikely(smap_violation(error_code, regs))) {
-                        bad_area_nosemaphore(regs, error_code, address);
-                        return;
-                }
+        if (unlikely(smap_violation(error_code, regs))) {
+                bad_area_nosemaphore(regs, error_code, address);
+                return;
         }
 
         /*
@@ -1244,32 +1249,50 @@ good_area:
         up_read(&mm->mmap_sem);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        unsigned long address = read_cr2(); /* Get the faulting address */
         enum ctx_state prev_state;
 
+        /*
+         * We must have this function tagged with __kprobes, notrace and call
+         * read_cr2() before calling anything else. To avoid calling any kind
+         * of tracing machinery before we've observed the CR2 value.
+         *
+         * exception_{enter,exit}() contain all sorts of tracepoints.
+         */
+
         prev_state = exception_enter();
-        __do_page_fault(regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
 
-static void trace_page_fault_entries(struct pt_regs *regs,
+#ifdef CONFIG_TRACING
+static void trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
                                      unsigned long error_code)
 {
         if (user_mode(regs))
-                trace_page_fault_user(read_cr2(), regs, error_code);
+                trace_page_fault_user(address, regs, error_code);
         else
-                trace_page_fault_kernel(read_cr2(), regs, error_code);
+                trace_page_fault_kernel(address, regs, error_code);
 }
 
-dotraplinkage void __kprobes
+dotraplinkage void __kprobes notrace
 trace_do_page_fault(struct pt_regs *regs, unsigned long error_code)
 {
+        /*
+         * The exception_enter and tracepoint processing could
+         * trigger another page faults (user space callchain
+         * reading) and destroy the original cr2 value, so read
+         * the faulting address now.
+         */
+        unsigned long address = read_cr2();
         enum ctx_state prev_state;
 
         prev_state = exception_enter();
-        trace_page_fault_entries(regs, error_code);
-        __do_page_fault(regs, error_code);
+        trace_page_fault_entries(address, regs, error_code);
+        __do_page_fault(regs, error_code, address);
         exception_exit(prev_state);
 }
+#endif /* CONFIG_TRACING */
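
Note on the hunks above: CR2 is now sampled in both entry points before exception_enter() and any tracepoint can run, because a tracing hook may itself fault (for example while reading a user-space callchain) and overwrite CR2. The following is a minimal user-space sketch of that ordering hazard, not kernel code; fake_cr2, instrumentation_hook() and the two handler names are made up for illustration.

#include <stdio.h>

/* Stand-in for the CR2 register: the address of the most recent fault. */
static unsigned long fake_cr2;

/* Stand-in for exception_enter()/tracepoints: it may fault again and
 * clobber fake_cr2 before the handler has looked at it. */
static void instrumentation_hook(void)
{
        fake_cr2 = 0xdeadbeef;  /* a nested fault overwrites the register */
}

/* Buggy order: instrumentation runs before the register is sampled. */
static unsigned long handle_fault_late_read(void)
{
        instrumentation_hook();
        return fake_cr2;                /* reads the clobbered value */
}

/* Fixed order (what the patch enforces): sample first, then trace. */
static unsigned long handle_fault_early_read(void)
{
        unsigned long address = fake_cr2;       /* read_cr2() first */
        instrumentation_hook();
        return address;                         /* original value preserved */
}

int main(void)
{
        fake_cr2 = 0x1000;      /* the "real" faulting address */
        printf("late read:  %#lx\n", handle_fault_late_read());

        fake_cr2 = 0x1000;
        printf("early read: %#lx\n", handle_fault_early_read());
        return 0;
}

The late read prints 0xdeadbeef while the early read keeps 0x1000, which is why read_cr2() moved ahead of exception_enter() and why the functions are tagged notrace.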
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 81b2750f3666..27aa0455fab3 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
                 struct numa_memblk *mb = &mi->blk[i];
                 memblock_set_node(mb->start, mb->end - mb->start,
                                   &memblock.memory, mb->nid);
-
-                /*
-                 * At this time, all memory regions reserved by memblock are
-                 * used by the kernel. Set the nid in memblock.reserved will
-                 * mark out all the nodes the kernel resides in.
-                 */
-                memblock_set_node(mb->start, mb->end - mb->start,
-                                  &memblock.reserved, mb->nid);
         }
 
         /*
@@ -565,10 +557,21 @@ static void __init numa_init_array(void)
 static void __init numa_clear_kernel_node_hotplug(void)
 {
         int i, nid;
-        nodemask_t numa_kernel_nodes;
+        nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
         unsigned long start, end;
         struct memblock_type *type = &memblock.reserved;
 
+        /*
+         * At this time, all memory regions reserved by memblock are
+         * used by the kernel. Set the nid in memblock.reserved will
+         * mark out all the nodes the kernel resides in.
+         */
+        for (i = 0; i < numa_meminfo.nr_blks; i++) {
+                struct numa_memblk *mb = &numa_meminfo.blk[i];
+                memblock_set_node(mb->start, mb->end - mb->start,
+                                  &memblock.reserved, mb->nid);
+        }
+
         /* Mark all kernel nodes. */
         for (i = 0; i < type->cnt; i++)
                 node_set(type->regions[i].nid, numa_kernel_nodes);
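
Note on the numa.c hunks: besides moving the memblock.reserved marking next to its only consumer, the patch initializes the on-stack mask with NODE_MASK_NONE so node_set() ORs bits into a known-empty mask rather than stack garbage. A minimal sketch of that accumulate-into-an-empty-mask pattern, with a plain unsigned long standing in for nodemask_t and made-up region data (not the kernel structures):

#include <stdio.h>

struct region { unsigned long start, end; int nid; };

/* Hypothetical data standing in for memblock.reserved / numa_meminfo. */
static const struct region reserved[] = {
        { 0x00000000, 0x10000000, 0 },  /* kernel image + early allocations */
        { 0x40000000, 0x40100000, 1 },  /* an initrd copy on node 1 */
};

int main(void)
{
        /* Like "nodemask_t mask = NODE_MASK_NONE": start from an empty mask.
         * An uninitialized stack mask could carry stale bits that would later
         * be treated as "kernel node" and wrongly lose their hotplug flag. */
        unsigned long kernel_nodes = 0;
        unsigned int i;

        for (i = 0; i < sizeof(reserved) / sizeof(reserved[0]); i++)
                kernel_nodes |= 1UL << reserved[i].nid; /* node_set() */

        for (i = 0; i < 8 * sizeof(kernel_nodes); i++)
                if (kernel_nodes & (1UL << i))
                        printf("node %u hosts kernel data; keep it non-hotpluggable\n", i);
        return 0;
}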
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 0342d27ca798..47b6436e41c2 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
                         nid, start, end);
         printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid);
         printk(KERN_DEBUG " ");
+        start = round_down(start, PAGES_PER_SECTION);
+        end = round_up(end, PAGES_PER_SECTION);
         for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                 physnode_map[pfn / PAGES_PER_SECTION] = nid;
                 printk(KERN_CONT "%lx ", pfn);
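
Note on the numa_32.c hunk: the loop advances one whole section per iteration, so with an unaligned start it can step past end before visiting the last section the range touches, leaving physnode_map entries unset. A small sketch of the miss, assuming an illustrative PAGES_PER_SECTION of 0x4000 pages (the real value depends on SECTION_SIZE_BITS) and simplified power-of-two round_down/round_up macros:

#include <stdio.h>

#define PAGES_PER_SECTION 0x4000UL                      /* illustrative value */
#define round_down(x, y)  ((x) & ~((y) - 1))            /* power-of-two y only */
#define round_up(x, y)    (((x) + (y) - 1) & ~((y) - 1))

/* Count how many distinct sections the loop body actually visits. */
static unsigned long sections_touched(unsigned long start, unsigned long end)
{
        unsigned long pfn, last = ~0UL, n = 0;

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
                unsigned long sec = pfn / PAGES_PER_SECTION;
                if (sec != last) {
                        n++;
                        last = sec;
                }
        }
        return n;
}

int main(void)
{
        unsigned long start = 0x3f00, end = 0x8100;     /* spans sections 0..2 */

        printf("unaligned: %lu sections\n", sections_touched(start, end));
        printf("rounded:   %lu sections\n",
               sections_touched(round_down(start, PAGES_PER_SECTION),
                                round_up(end, PAGES_PER_SECTION)));
        return 0;
}

The unaligned walk visits only 2 sections even though the range touches 3; after rounding, all 3 get their physnode_map entry.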
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 1a25187e151e..1953e9c9391a 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -42,15 +42,25 @@ static __init inline int srat_disabled(void)
         return acpi_numa < 0;
 }
 
-/* Callback for SLIT parsing */
+/*
+ * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for
+ * I/O localities since SRAT does not list them. I/O localities are
+ * not supported at this point.
+ */
 void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
 {
         int i, j;
 
-        for (i = 0; i < slit->locality_count; i++)
-                for (j = 0; j < slit->locality_count; j++)
+        for (i = 0; i < slit->locality_count; i++) {
+                if (pxm_to_node(i) == NUMA_NO_NODE)
+                        continue;
+                for (j = 0; j < slit->locality_count; j++) {
+                        if (pxm_to_node(j) == NUMA_NO_NODE)
+                                continue;
                         numa_set_distance(pxm_to_node(i), pxm_to_node(j),
                                 slit->entry[slit->locality_count * i + j]);
+                }
+        }
 }
 
 /* Callback for Proximity Domain -> x2APIC mapping */
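
Note on the srat.c hunk: the SLIT is a flattened locality_count x locality_count matrix indexed as entry[count * i + j], and pxm_to_node() returns NUMA_NO_NODE (-1) for I/O-only proximity domains, which must not be fed into numa_set_distance(). A small sketch of the guarded walk; the 3-locality table, the pxm_to_node() stub and the printing numa_set_distance() are made up for illustration:

#include <stdio.h>

#define NUMA_NO_NODE (-1)

/* Hypothetical 3-locality SLIT: entry[count * i + j] is the distance
 * from locality i to locality j. Locality 2 is an I/O-only locality. */
static const int locality_count = 3;
static const unsigned char entry[] = {
        10,  20, 255,
        20,  10, 255,
        255, 255, 10,
};

/* Stub: PXM 2 has no memory/CPU node behind it. */
static int pxm_to_node(int pxm)
{
        return pxm == 2 ? NUMA_NO_NODE : pxm;
}

static void numa_set_distance(int from, int to, int distance)
{
        printf("distance %d -> %d = %d\n", from, to, distance);
}

int main(void)
{
        int i, j;

        for (i = 0; i < locality_count; i++) {
                if (pxm_to_node(i) == NUMA_NO_NODE)
                        continue;       /* I/O locality: no node to set */
                for (j = 0; j < locality_count; j++) {
                        if (pxm_to_node(j) == NUMA_NO_NODE)
                                continue;
                        numa_set_distance(pxm_to_node(i), pxm_to_node(j),
                                          entry[locality_count * i + j]);
                }
        }
        return 0;
}

Only the distances between localities 0 and 1 are recorded; without the guard, rows and columns for the I/O locality would be pushed into the distance table under node index -1.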
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ae699b3bbac8..dd8dda167a24 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -103,7 +103,7 @@ static void flush_tlb_func(void *info)
         if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
                 return;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
                 if (f->flush_end == TLB_FLUSH_ALL)
                         local_flush_tlb();
@@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
         info.flush_start = start;
         info.flush_end = end;
 
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         if (is_uv_system()) {
                 unsigned int cpu;
 
@@ -151,44 +151,19 @@ void flush_tlb_current_task(void)
 
         preempt_disable();
 
-        count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
         local_flush_tlb();
         if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
                 flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
         preempt_enable();
 }
 
-/*
- * It can find out the THP large page, or
- * HUGETLB page in tlb_flush when THP disabled
- */
-static inline unsigned long has_large_page(struct mm_struct *mm,
-                                unsigned long start, unsigned long end)
-{
-        pgd_t *pgd;
-        pud_t *pud;
-        pmd_t *pmd;
-        unsigned long addr = ALIGN(start, HPAGE_SIZE);
-        for (; addr < end; addr += HPAGE_SIZE) {
-                pgd = pgd_offset(mm, addr);
-                if (likely(!pgd_none(*pgd))) {
-                        pud = pud_offset(pgd, addr);
-                        if (likely(!pud_none(*pud))) {
-                                pmd = pmd_offset(pud, addr);
-                                if (likely(!pmd_none(*pmd)))
-                                        if (pmd_large(*pmd))
-                                                return addr;
-                        }
-                }
-        }
-        return 0;
-}
-
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                                 unsigned long end, unsigned long vmflag)
 {
         unsigned long addr;
         unsigned act_entries, tlb_entries = 0;
+        unsigned long nr_base_pages;
 
         preempt_disable();
         if (current->active_mm != mm)
@@ -210,21 +185,20 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
                 tlb_entries = tlb_lli_4k[ENTRIES];
         else
                 tlb_entries = tlb_lld_4k[ENTRIES];
+
         /* Assume all of TLB entries was occupied by this task */
-        act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm;
+        act_entries = tlb_entries >> tlb_flushall_shift;
+        act_entries = mm->total_vm > act_entries ? act_entries : mm->total_vm;
+        nr_base_pages = (end - start) >> PAGE_SHIFT;
 
         /* tlb_flushall_shift is on balance point, details in commit log */
-        if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) {
-                count_vm_event(NR_TLB_LOCAL_FLUSH_ALL);
+        if (nr_base_pages > act_entries) {
+                count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
                 local_flush_tlb();
         } else {
-                if (has_large_page(mm, start, end)) {
-                        local_flush_tlb();
-                        goto flush_all;
-                }
                 /* flush range by one by one 'invlpg' */
                 for (addr = start; addr < end; addr += PAGE_SIZE) {
-                        count_vm_event(NR_TLB_LOCAL_FLUSH_ONE);
+                        count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
                         __flush_tlb_single(addr);
                 }
 
@@ -262,7 +236,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 
 static void do_flush_tlb_all(void *info)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
         __flush_tlb_all();
         if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
                 leave_mm(smp_processor_id());
@@ -270,7 +244,7 @@ static void do_flush_tlb_all(void *info)
 
 void flush_tlb_all(void)
 {
-        count_vm_event(NR_TLB_REMOTE_FLUSH);
+        count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
         on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
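
Note on the flush_tlb_mm_range() hunks: the flush decision now compares the number of 4K pages in the range against act_entries, which is the smaller of mm->total_vm and tlb_entries >> tlb_flushall_shift. A worked sketch of that arithmetic, assuming a hypothetical 512-entry TLB and a shift of 2 (so anything above 128 pages takes the full-flush path); wants_full_flush() and the constants are illustrative, not values from any real CPU table:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Hypothetical hardware/tuning values. */
static unsigned long tlb_entries = 512;
static int tlb_flushall_shift = 2;

/* Returns 1 for a full TLB flush, 0 for per-page invlpg flushes. */
static int wants_full_flush(unsigned long total_vm_pages,
                            unsigned long start, unsigned long end)
{
        unsigned long act_entries, nr_base_pages;

        act_entries = tlb_entries >> tlb_flushall_shift;        /* 512 >> 2 = 128 */
        act_entries = total_vm_pages > act_entries ? act_entries : total_vm_pages;
        nr_base_pages = (end - start) >> PAGE_SHIFT;

        return nr_base_pages > act_entries;
}

int main(void)
{
        /* 256 KiB range: 64 pages <= 128, so flush page by page. */
        printf("256 KiB: %s\n",
               wants_full_flush(100000, 0, 256 << 10) ? "full" : "per-page");
        /* 1 MiB range: 256 pages > 128, cheaper to flush everything. */
        printf("1 MiB:   %s\n",
               wants_full_flush(100000, 0, 1 << 20) ? "full" : "per-page");
        return 0;
}

With the has_large_page() special case gone, this single threshold is the only thing that decides between invlpg loops and a full flush.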