author    Linus Torvalds <torvalds@linux-foundation.org>  2017-07-03 17:45:09 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2017-07-03 17:45:09 -0400
commit    7a69f9c60b49699579f5bfb71f928cceba0afe1a (patch)
tree      bf3b5640bbd9f23beeb5a55d18348d65bafff8e8 /arch/x86
parent    9bc088ab66be8978fbc981ba9644468fa2c2fd3f (diff)
parent    8781fb7e9749da424e01daacd14834b674658c63 (diff)

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Continued work to add support for 5-level paging provided by future
     Intel CPUs. In particular we switch the x86 GUP code to the generic
     implementation. (Kirill A. Shutemov)

   - Continued work to add PCID CPU support to native kernels as well.
     In this round most of the focus is on reworking/refreshing the TLB
     flush infrastructure for the upcoming PCID changes. (Andy
     Lutomirski)"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (34 commits)
  x86/mm: Delete a big outdated comment about TLB flushing
  x86/mm: Don't reenter flush_tlb_func_common()
  x86/KASLR: Fix detection 32/64 bit bootloaders for 5-level paging
  x86/ftrace: Exclude functions in head64.c from function-tracing
  x86/mmap, ASLR: Do not treat unlimited-stack tasks as legacy mmap
  x86/mm: Remove reset_lazy_tlbstate()
  x86/ldt: Simplify the LDT switching logic
  x86/boot/64: Put __startup_64() into .head.text
  x86/mm: Add support for 5-level paging for KASLR
  x86/mm: Make kernel_physical_mapping_init() support 5-level paging
  x86/mm: Add sync_global_pgds() for configuration with 5-level paging
  x86/boot/64: Add support of additional page table level during early boot
  x86/boot/64: Rename init_level4_pgt and early_level4_pgt
  x86/boot/64: Rewrite startup_64() in C
  x86/boot/compressed: Enable 5-level paging during decompression stage
  x86/boot/efi: Define __KERNEL32_CS GDT on 64-bit configurations
  x86/boot/efi: Fix __KERNEL_CS definition of GDT entry on 64-bit configurations
  x86/boot/efi: Cleanup initialization of GDT entries
  x86/asm: Fix comment in return_from_SYSCALL_64()
  x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 5
-rw-r--r--  arch/x86/boot/compressed/eboot.c | 73
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 86
-rw-r--r--  arch/x86/boot/compressed/pagetable.c | 18
-rw-r--r--  arch/x86/entry/entry_64.S | 3
-rw-r--r--  arch/x86/events/core.c | 5
-rw-r--r--  arch/x86/include/asm/efi.h | 2
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/mmu.h | 6
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 63
-rw-r--r--  arch/x86/include/asm/paravirt.h | 8
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 47
-rw-r--r--  arch/x86/include/asm/pgtable.h | 55
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 22
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 36
-rw-r--r--  arch/x86/include/asm/processor.h | 8
-rw-r--r--  arch/x86/include/asm/special_insns.h | 10
-rw-r--r--  arch/x86/include/asm/tlbbatch.h | 14
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 114
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 11
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h | 2
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/espfix_64.c | 2
-rw-r--r--  arch/x86/kernel/head64.c | 145
-rw-r--r--  arch/x86/kernel/head_64.S | 131
-rw-r--r--  arch/x86/kernel/ldt.c | 56
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 2
-rw-r--r--  arch/x86/kernel/paravirt.c | 2
-rw-r--r--  arch/x86/kernel/process_32.c | 2
-rw-r--r--  arch/x86/kernel/process_64.c | 4
-rw-r--r--  arch/x86/kernel/smpboot.c | 1
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kvm/vmx.c | 21
-rw-r--r--  arch/x86/math-emu/fpu_system.h | 2
-rw-r--r--  arch/x86/mm/Makefile | 2
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 2
-rw-r--r--  arch/x86/mm/fault.c | 10
-rw-r--r--  arch/x86/mm/gup.c | 496
-rw-r--r--  arch/x86/mm/init.c | 4
-rw-r--r--  arch/x86/mm/init_64.c | 108
-rw-r--r--  arch/x86/mm/ioremap.c | 2
-rw-r--r--  arch/x86/mm/kasan_init_64.c | 12
-rw-r--r--  arch/x86/mm/kaslr.c | 81
-rw-r--r--  arch/x86/mm/mmap.c | 3
-rw-r--r--  arch/x86/mm/tlb.c | 458
-rw-r--r--  arch/x86/platform/efi/efi_64.c | 4
-rw-r--r--  arch/x86/platform/olpc/olpc-xo1-pm.c | 2
-rw-r--r--  arch/x86/platform/uv/tlb_uv.c | 10
-rw-r--r--  arch/x86/power/cpu.c | 2
-rw-r--r--  arch/x86/power/hibernate_64.c | 3
-rw-r--r--  arch/x86/realmode/init.c | 2
-rw-r--r--  arch/x86/xen/mmu_pv.c | 83
-rw-r--r--  arch/x86/xen/xen-pvh.S | 2
54 files changed, 1061 insertions, 1191 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0efb4c9497bc..737212c0333e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -69,7 +69,7 @@ config X86
69 select ARCH_USE_BUILTIN_BSWAP 69 select ARCH_USE_BUILTIN_BSWAP
70 select ARCH_USE_QUEUED_RWLOCKS 70 select ARCH_USE_QUEUED_RWLOCKS
71 select ARCH_USE_QUEUED_SPINLOCKS 71 select ARCH_USE_QUEUED_SPINLOCKS
72 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP 72 select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
73 select ARCH_WANT_FRAME_POINTERS 73 select ARCH_WANT_FRAME_POINTERS
74 select ARCH_WANTS_DYNAMIC_TASK_STRUCT 74 select ARCH_WANTS_DYNAMIC_TASK_STRUCT
75 select BUILDTIME_EXTABLE_SORT 75 select BUILDTIME_EXTABLE_SORT
@@ -2793,6 +2793,9 @@ config X86_DMA_REMAP
2793 bool 2793 bool
2794 depends on STA2X11 2794 depends on STA2X11
2795 2795
2796config HAVE_GENERIC_GUP
2797 def_bool y
2798
2796source "net/Kconfig" 2799source "net/Kconfig"
2797 2800
2798source "drivers/Kconfig" 2801source "drivers/Kconfig"
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index cbf4b87f55b9..c3e869eaef0c 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1046,9 +1046,31 @@ struct boot_params *efi_main(struct efi_config *c,
1046 memset((char *)gdt->address, 0x0, gdt->size); 1046 memset((char *)gdt->address, 0x0, gdt->size);
1047 desc = (struct desc_struct *)gdt->address; 1047 desc = (struct desc_struct *)gdt->address;
1048 1048
1049 /* The first GDT is a dummy and the second is unused. */ 1049 /* The first GDT is a dummy. */
1050 desc += 2; 1050 desc++;
1051
1052 if (IS_ENABLED(CONFIG_X86_64)) {
1053 /* __KERNEL32_CS */
1054 desc->limit0 = 0xffff;
1055 desc->base0 = 0x0000;
1056 desc->base1 = 0x0000;
1057 desc->type = SEG_TYPE_CODE | SEG_TYPE_EXEC_READ;
1058 desc->s = DESC_TYPE_CODE_DATA;
1059 desc->dpl = 0;
1060 desc->p = 1;
1061 desc->limit = 0xf;
1062 desc->avl = 0;
1063 desc->l = 0;
1064 desc->d = SEG_OP_SIZE_32BIT;
1065 desc->g = SEG_GRANULARITY_4KB;
1066 desc->base2 = 0x00;
1067 desc++;
1068 } else {
1069 /* Second entry is unused on 32-bit */
1070 desc++;
1071 }
1051 1072
1073 /* __KERNEL_CS */
1052 desc->limit0 = 0xffff; 1074 desc->limit0 = 0xffff;
1053 desc->base0 = 0x0000; 1075 desc->base0 = 0x0000;
1054 desc->base1 = 0x0000; 1076 desc->base1 = 0x0000;
@@ -1058,12 +1080,18 @@ struct boot_params *efi_main(struct efi_config *c,
1058 desc->p = 1; 1080 desc->p = 1;
1059 desc->limit = 0xf; 1081 desc->limit = 0xf;
1060 desc->avl = 0; 1082 desc->avl = 0;
1061 desc->l = 0; 1083 if (IS_ENABLED(CONFIG_X86_64)) {
1062 desc->d = SEG_OP_SIZE_32BIT; 1084 desc->l = 1;
1085 desc->d = 0;
1086 } else {
1087 desc->l = 0;
1088 desc->d = SEG_OP_SIZE_32BIT;
1089 }
1063 desc->g = SEG_GRANULARITY_4KB; 1090 desc->g = SEG_GRANULARITY_4KB;
1064 desc->base2 = 0x00; 1091 desc->base2 = 0x00;
1065
1066 desc++; 1092 desc++;
1093
1094 /* __KERNEL_DS */
1067 desc->limit0 = 0xffff; 1095 desc->limit0 = 0xffff;
1068 desc->base0 = 0x0000; 1096 desc->base0 = 0x0000;
1069 desc->base1 = 0x0000; 1097 desc->base1 = 0x0000;
@@ -1077,24 +1105,25 @@ struct boot_params *efi_main(struct efi_config *c,
1077 desc->d = SEG_OP_SIZE_32BIT; 1105 desc->d = SEG_OP_SIZE_32BIT;
1078 desc->g = SEG_GRANULARITY_4KB; 1106 desc->g = SEG_GRANULARITY_4KB;
1079 desc->base2 = 0x00; 1107 desc->base2 = 0x00;
1080
1081#ifdef CONFIG_X86_64
1082 /* Task segment value */
1083 desc++; 1108 desc++;
1084 desc->limit0 = 0x0000; 1109
1085 desc->base0 = 0x0000; 1110 if (IS_ENABLED(CONFIG_X86_64)) {
1086 desc->base1 = 0x0000; 1111 /* Task segment value */
1087 desc->type = SEG_TYPE_TSS; 1112 desc->limit0 = 0x0000;
1088 desc->s = 0; 1113 desc->base0 = 0x0000;
1089 desc->dpl = 0; 1114 desc->base1 = 0x0000;
1090 desc->p = 1; 1115 desc->type = SEG_TYPE_TSS;
1091 desc->limit = 0x0; 1116 desc->s = 0;
1092 desc->avl = 0; 1117 desc->dpl = 0;
1093 desc->l = 0; 1118 desc->p = 1;
1094 desc->d = 0; 1119 desc->limit = 0x0;
1095 desc->g = SEG_GRANULARITY_4KB; 1120 desc->avl = 0;
1096 desc->base2 = 0x00; 1121 desc->l = 0;
1097#endif /* CONFIG_X86_64 */ 1122 desc->d = 0;
1123 desc->g = SEG_GRANULARITY_4KB;
1124 desc->base2 = 0x00;
1125 desc++;
1126 }
1098 1127
1099 asm volatile("cli"); 1128 asm volatile("cli");
1100 asm volatile ("lgdt %0" : : "m" (*gdt)); 1129 asm volatile ("lgdt %0" : : "m" (*gdt));
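The two code-segment quads that appear in the compressed-boot GDT later in this series (0x00cf9a000000ffff for __KERNEL32_CS, 0x00af9a000000ffff for __KERNEL_CS) are the packed form of exactly the fields efi_main() fills in above; the only difference between the two segments is the L/D bit pair. A minimal userspace sketch of that packing, using the architectural descriptor layout (the helper name is made up):

#include <stdint.h>
#include <stdio.h>

/* Pack a flat 4G code-segment descriptor; bit positions per the x86 SDM.
 * Field names mirror the desc_struct members used in eboot.c above. */
static uint64_t pack_code_desc(int l, int d)
{
	uint64_t desc = 0;

	desc |= 0xffffULL;		/* limit0 = 0xffff          */
	desc |= 0xaULL << 40;		/* type   = exec/read code  */
	desc |= 1ULL << 44;		/* s      = code/data       */
	desc |= 1ULL << 47;		/* p      = present         */
	desc |= 0xfULL << 48;		/* limit1 = 0xf             */
	desc |= (uint64_t)l << 53;	/* l      = 64-bit code     */
	desc |= (uint64_t)d << 54;	/* d      = default op size */
	desc |= 1ULL << 55;		/* g      = 4K granularity  */
	return desc;
}

int main(void)
{
	printf("__KERNEL32_CS = %#llx\n",
	       (unsigned long long)pack_code_desc(0, 1)); /* 0x00cf9a000000ffff */
	printf("__KERNEL_CS   = %#llx\n",
	       (unsigned long long)pack_code_desc(1, 0)); /* 0x00af9a000000ffff */
	return 0;
}

With l=0/d=1 the descriptor is a 32-bit code segment; with l=1/d=0 it is a 64-bit one, which is why the new 64-bit branch of efi_main() flips exactly those two bits.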
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index d2ae1f821e0c..fbf4c32d0b62 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -346,6 +346,48 @@ preferred_addr:
346 /* Set up the stack */ 346 /* Set up the stack */
347 leaq boot_stack_end(%rbx), %rsp 347 leaq boot_stack_end(%rbx), %rsp
348 348
349#ifdef CONFIG_X86_5LEVEL
350 /* Check if 5-level paging has already enabled */
351 movq %cr4, %rax
352 testl $X86_CR4_LA57, %eax
353 jnz lvl5
354
355 /*
356 * At this point we are in long mode with 4-level paging enabled,
357 * but we want to enable 5-level paging.
358 *
359 * The problem is that we cannot do it directly. Setting LA57 in
360 * long mode would trigger #GP. So we need to switch off long mode
361 * first.
362 *
363 * NOTE: This is not going to work if bootloader put us above 4G
364 * limit.
365 *
366 * The first step is go into compatibility mode.
367 */
368
369 /* Clear additional page table */
370 leaq lvl5_pgtable(%rbx), %rdi
371 xorq %rax, %rax
372 movq $(PAGE_SIZE/8), %rcx
373 rep stosq
374
375 /*
376 * Setup current CR3 as the first and only entry in a new top level
377 * page table.
378 */
379 movq %cr3, %rdi
380 leaq 0x7 (%rdi), %rax
381 movq %rax, lvl5_pgtable(%rbx)
382
383 /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */
384 pushq $__KERNEL32_CS
385 leaq compatible_mode(%rip), %rax
386 pushq %rax
387 lretq
388lvl5:
389#endif
390
349 /* Zero EFLAGS */ 391 /* Zero EFLAGS */
350 pushq $0 392 pushq $0
351 popfq 393 popfq
@@ -429,6 +471,44 @@ relocated:
429 jmp *%rax 471 jmp *%rax
430 472
431 .code32 473 .code32
474#ifdef CONFIG_X86_5LEVEL
475compatible_mode:
476 /* Setup data and stack segments */
477 movl $__KERNEL_DS, %eax
478 movl %eax, %ds
479 movl %eax, %ss
480
481 /* Disable paging */
482 movl %cr0, %eax
483 btrl $X86_CR0_PG_BIT, %eax
484 movl %eax, %cr0
485
486 /* Point CR3 to 5-level paging */
487 leal lvl5_pgtable(%ebx), %eax
488 movl %eax, %cr3
489
490 /* Enable PAE and LA57 mode */
491 movl %cr4, %eax
492 orl $(X86_CR4_PAE | X86_CR4_LA57), %eax
493 movl %eax, %cr4
494
495 /* Calculate address we are running at */
496 call 1f
4971: popl %edi
498 subl $1b, %edi
499
500 /* Prepare stack for far return to Long Mode */
501 pushl $__KERNEL_CS
502 leal lvl5(%edi), %eax
503 push %eax
504
505 /* Enable paging back */
506 movl $(X86_CR0_PG | X86_CR0_PE), %eax
507 movl %eax, %cr0
508
509 lret
510#endif
511
432no_longmode: 512no_longmode:
433 /* This isn't an x86-64 CPU so hang */ 513 /* This isn't an x86-64 CPU so hang */
4341: 5141:
@@ -442,7 +522,7 @@ gdt:
442 .word gdt_end - gdt 522 .word gdt_end - gdt
443 .long gdt 523 .long gdt
444 .word 0 524 .word 0
445 .quad 0x0000000000000000 /* NULL descriptor */ 525 .quad 0x00cf9a000000ffff /* __KERNEL32_CS */
446 .quad 0x00af9a000000ffff /* __KERNEL_CS */ 526 .quad 0x00af9a000000ffff /* __KERNEL_CS */
447 .quad 0x00cf92000000ffff /* __KERNEL_DS */ 527 .quad 0x00cf92000000ffff /* __KERNEL_DS */
448 .quad 0x0080890000000000 /* TS descriptor */ 528 .quad 0x0080890000000000 /* TS descriptor */
@@ -486,3 +566,7 @@ boot_stack_end:
486 .balign 4096 566 .balign 4096
487pgtable: 567pgtable:
488 .fill BOOT_PGT_SIZE, 1, 0 568 .fill BOOT_PGT_SIZE, 1, 0
569#ifdef CONFIG_X86_5LEVEL
570lvl5_pgtable:
571 .fill PAGE_SIZE, 1, 0
572#endif
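The trampoline above cannot set CR4.LA57 while long mode is active, so it drops to compatibility mode, turns paging off, installs lvl5_pgtable as the new top level and re-enters long mode. The new table needs only one entry: the old 4-level CR3 plus the low flag bits. A small sketch of that entry construction, assuming the 0x7 added by "leaq 0x7 (%rdi)" is the usual present/write/user PTE bits and using a made-up CR3 value:

#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT	0x001ULL
#define _PAGE_RW	0x002ULL
#define _PAGE_USER	0x004ULL

int main(void)
{
	/* Pretend the current 4-level CR3 points here (hypothetical value). */
	uint64_t old_cr3 = 0x3ff000;
	uint64_t lvl5_pgtable[512] = { 0 };

	/* The old top-level table becomes entry 0 of the new top level. */
	lvl5_pgtable[0] = old_cr3 + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER);

	printf("lvl5_pgtable[0] = %#llx\n", (unsigned long long)lvl5_pgtable[0]);
	return 0;
}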
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 1d78f1739087..28029be47fbb 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -63,7 +63,7 @@ static void *alloc_pgt_page(void *context)
63static struct alloc_pgt_data pgt_data; 63static struct alloc_pgt_data pgt_data;
64 64
65/* The top level page table entry pointer. */ 65/* The top level page table entry pointer. */
66static unsigned long level4p; 66static unsigned long top_level_pgt;
67 67
68/* 68/*
69 * Mapping information structure passed to kernel_ident_mapping_init(). 69 * Mapping information structure passed to kernel_ident_mapping_init().
@@ -91,9 +91,15 @@ void initialize_identity_maps(void)
91 * If we came here via startup_32(), cr3 will be _pgtable already 91 * If we came here via startup_32(), cr3 will be _pgtable already
92 * and we must append to the existing area instead of entirely 92 * and we must append to the existing area instead of entirely
93 * overwriting it. 93 * overwriting it.
94 *
95 * With 5-level paging, we use '_pgtable' to allocate the p4d page table,
96 * the top-level page table is allocated separately.
97 *
98 * p4d_offset(top_level_pgt, 0) would cover both the 4- and 5-level
99 * cases. On 4-level paging it's equal to 'top_level_pgt'.
94 */ 100 */
95 level4p = read_cr3(); 101 top_level_pgt = read_cr3_pa();
96 if (level4p == (unsigned long)_pgtable) { 102 if (p4d_offset((pgd_t *)top_level_pgt, 0) == (p4d_t *)_pgtable) {
97 debug_putstr("booted via startup_32()\n"); 103 debug_putstr("booted via startup_32()\n");
98 pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE; 104 pgt_data.pgt_buf = _pgtable + BOOT_INIT_PGT_SIZE;
99 pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE; 105 pgt_data.pgt_buf_size = BOOT_PGT_SIZE - BOOT_INIT_PGT_SIZE;
@@ -103,7 +109,7 @@ void initialize_identity_maps(void)
103 pgt_data.pgt_buf = _pgtable; 109 pgt_data.pgt_buf = _pgtable;
104 pgt_data.pgt_buf_size = BOOT_PGT_SIZE; 110 pgt_data.pgt_buf_size = BOOT_PGT_SIZE;
105 memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size); 111 memset(pgt_data.pgt_buf, 0, pgt_data.pgt_buf_size);
106 level4p = (unsigned long)alloc_pgt_page(&pgt_data); 112 top_level_pgt = (unsigned long)alloc_pgt_page(&pgt_data);
107 } 113 }
108} 114}
109 115
@@ -123,7 +129,7 @@ void add_identity_map(unsigned long start, unsigned long size)
123 return; 129 return;
124 130
125 /* Build the mapping. */ 131 /* Build the mapping. */
126 kernel_ident_mapping_init(&mapping_info, (pgd_t *)level4p, 132 kernel_ident_mapping_init(&mapping_info, (pgd_t *)top_level_pgt,
127 start, end); 133 start, end);
128} 134}
129 135
@@ -134,5 +140,5 @@ void add_identity_map(unsigned long start, unsigned long size)
134 */ 140 */
135void finalize_identity_maps(void) 141void finalize_identity_maps(void)
136{ 142{
137 write_cr3(level4p); 143 write_cr3(top_level_pgt);
138} 144}
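The check above works for both page-table layouts because p4d_offset() collapses to the pgd itself when the p4d level is folded (4-level paging) and only descends one extra level when CONFIG_X86_5LEVEL is enabled. A toy userspace model of that folding, not the kernel's real implementation:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

typedef struct { uint64_t pgd; } pgd_t;
typedef struct { uint64_t p4d; } p4d_t;

static bool five_level = false;	/* flip to model CONFIG_X86_5LEVEL=y */

#define PTRS_PER_P4D	512
#define P4D_SHIFT	39

static p4d_t *p4d_offset(pgd_t *pgd, uint64_t addr)
{
	if (!five_level)
		return (p4d_t *)pgd;	/* folded: the "p4d" is the pgd entry itself */
	/* 5-level: follow the pgd entry down to the p4d page. */
	return (p4d_t *)(uintptr_t)(pgd->pgd & ~0xfffULL) +
	       ((addr >> P4D_SHIFT) & (PTRS_PER_P4D - 1));
}

int main(void)
{
	pgd_t top = { 0 };

	printf("4-level: p4d_offset(top, 0) == &top ? %d\n",
	       p4d_offset(&top, 0) == (p4d_t *)&top);
	return 0;
}

So on 4-level kernels the new comparison degenerates to the old "top_level_pgt == _pgtable" test.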
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4a4c0834f965..a9a8027a6c0e 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,7 +265,8 @@ return_from_SYSCALL_64:
265 * If width of "canonical tail" ever becomes variable, this will need 265 * If width of "canonical tail" ever becomes variable, this will need
266 * to be updated to remain correct on both old and new CPUs. 266 * to be updated to remain correct on both old and new CPUs.
267 * 267 *
268 * Change top 16 bits to be the sign-extension of 47th bit 268 * Change top bits to match most significant bit (47th or 56th bit
269 * depending on paging mode) in the address.
269 */ 270 */
270 shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx 271 shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
271 sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx 272 sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
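The shl/sar pair sign-extends the topmost implemented virtual-address bit into the unused high bits, so the same two instructions produce a canonical address for both the 47-bit and 56-bit layouts once __VIRTUAL_MASK_SHIFT changes. The same trick in plain C (the input value is illustrative):

#include <stdint.h>
#include <stdio.h>

/* __VIRTUAL_MASK_SHIFT is 47 with 4-level paging and 56 with 5-level paging. */
static uint64_t canonicalize(uint64_t addr, int virtual_mask_shift)
{
	int s = 64 - (virtual_mask_shift + 1);

	/* shl + sar: replicate the top implemented bit into the high bits. */
	return (uint64_t)(((int64_t)(addr << s)) >> s);
}

int main(void)
{
	/* Bit 47 set, so the tail sign-extends: prints 0xffff800000000000. */
	printf("%#llx\n", (unsigned long long)canonicalize(0x0000800000000000ULL, 47));
	return 0;
}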
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 628b8c556aab..2de0dd73830a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2111,8 +2111,7 @@ static int x86_pmu_event_init(struct perf_event *event)
2111 2111
2112static void refresh_pce(void *ignored) 2112static void refresh_pce(void *ignored)
2113{ 2113{
2114 if (current->active_mm) 2114 load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
2115 load_mm_cr4(current->active_mm);
2116} 2115}
2117 2116
2118static void x86_pmu_event_mapped(struct perf_event *event) 2117static void x86_pmu_event_mapped(struct perf_event *event)
@@ -2344,7 +2343,7 @@ static unsigned long get_segment_base(unsigned int segment)
2344 2343
2345 /* IRQs are off, so this synchronizes with smp_store_release */ 2344 /* IRQs are off, so this synchronizes with smp_store_release */
2346 ldt = lockless_dereference(current->active_mm->context.ldt); 2345 ldt = lockless_dereference(current->active_mm->context.ldt);
2347 if (!ldt || idx > ldt->size) 2346 if (!ldt || idx > ldt->nr_entries)
2348 return 0; 2347 return 0;
2349 2348
2350 desc = &ldt->entries[idx]; 2349 desc = &ldt->entries[idx];
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2f77bcefe6b4..d2ff779f347e 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -74,7 +74,7 @@ struct efi_scratch {
74 __kernel_fpu_begin(); \ 74 __kernel_fpu_begin(); \
75 \ 75 \
76 if (efi_scratch.use_pgd) { \ 76 if (efi_scratch.use_pgd) { \
77 efi_scratch.prev_cr3 = read_cr3(); \ 77 efi_scratch.prev_cr3 = __read_cr3(); \
78 write_cr3((unsigned long)efi_scratch.efi_pgt); \ 78 write_cr3((unsigned long)efi_scratch.efi_pgt); \
79 __flush_tlb_all(); \ 79 __flush_tlb_all(); \
80 } \ 80 } \
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
22#ifdef CONFIG_SMP 22#ifdef CONFIG_SMP
23 unsigned int irq_resched_count; 23 unsigned int irq_resched_count;
24 unsigned int irq_call_count; 24 unsigned int irq_call_count;
25 unsigned int irq_tlb_count;
26#endif 25#endif
26 unsigned int irq_tlb_count;
27#ifdef CONFIG_X86_THERMAL_VECTOR 27#ifdef CONFIG_X86_THERMAL_VECTOR
28 unsigned int irq_thermal_count; 28 unsigned int irq_thermal_count;
29#endif 29#endif
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index f9813b6d8b80..79b647a7ebd0 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,12 +37,6 @@ typedef struct {
37#endif 37#endif
38} mm_context_t; 38} mm_context_t;
39 39
40#ifdef CONFIG_SMP
41void leave_mm(int cpu); 40void leave_mm(int cpu);
42#else
43static inline void leave_mm(int cpu)
44{
45}
46#endif
47 41
48#endif /* _ASM_X86_MMU_H */ 42#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 68b329d77b3a..ecfcb6643c9b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -47,7 +47,7 @@ struct ldt_struct {
47 * allocations, but it's not worth trying to optimize. 47 * allocations, but it's not worth trying to optimize.
48 */ 48 */
49 struct desc_struct *entries; 49 struct desc_struct *entries;
50 unsigned int size; 50 unsigned int nr_entries;
51}; 51};
52 52
53/* 53/*
@@ -87,22 +87,46 @@ static inline void load_mm_ldt(struct mm_struct *mm)
87 */ 87 */
88 88
89 if (unlikely(ldt)) 89 if (unlikely(ldt))
90 set_ldt(ldt->entries, ldt->size); 90 set_ldt(ldt->entries, ldt->nr_entries);
91 else 91 else
92 clear_LDT(); 92 clear_LDT();
93#else 93#else
94 clear_LDT(); 94 clear_LDT();
95#endif 95#endif
96}
97
98static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
99{
100#ifdef CONFIG_MODIFY_LDT_SYSCALL
101 /*
102 * Load the LDT if either the old or new mm had an LDT.
103 *
104 * An mm will never go from having an LDT to not having an LDT. Two
105 * mms never share an LDT, so we don't gain anything by checking to
106 * see whether the LDT changed. There's also no guarantee that
107 * prev->context.ldt actually matches LDTR, but, if LDTR is non-NULL,
108 * then prev->context.ldt will also be non-NULL.
109 *
110 * If we really cared, we could optimize the case where prev == next
111 * and we're exiting lazy mode. Most of the time, if this happens,
112 * we don't actually need to reload LDTR, but modify_ldt() is mostly
113 * used by legacy code and emulators where we don't need this level of
114 * performance.
115 *
116 * This uses | instead of || because it generates better code.
117 */
118 if (unlikely((unsigned long)prev->context.ldt |
119 (unsigned long)next->context.ldt))
120 load_mm_ldt(next);
121#endif
96 122
97 DEBUG_LOCKS_WARN_ON(preemptible()); 123 DEBUG_LOCKS_WARN_ON(preemptible());
98} 124}
99 125
100static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 126static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
101{ 127{
102#ifdef CONFIG_SMP
103 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 128 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
104 this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); 129 this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
105#endif
106} 130}
107 131
108static inline int init_new_context(struct task_struct *tsk, 132static inline int init_new_context(struct task_struct *tsk,
@@ -220,18 +244,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
220} 244}
221#endif 245#endif
222 246
223static inline bool __pkru_allows_pkey(u16 pkey, bool write)
224{
225 u32 pkru = read_pkru();
226
227 if (!__pkru_allows_read(pkru, pkey))
228 return false;
229 if (write && !__pkru_allows_write(pkru, pkey))
230 return false;
231
232 return true;
233}
234
235/* 247/*
236 * We only want to enforce protection keys on the current process 248 * We only want to enforce protection keys on the current process
237 * because we effectively have no access to PKRU for other 249 * because we effectively have no access to PKRU for other
@@ -268,4 +280,23 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
268 return __pkru_allows_pkey(vma_pkey(vma), write); 280 return __pkru_allows_pkey(vma_pkey(vma), write);
269} 281}
270 282
283
284/*
285 * This can be used from process context to figure out what the value of
286 * CR3 is without needing to do a (slow) __read_cr3().
287 *
288 * It's intended to be used for code like KVM that sneakily changes CR3
289 * and needs to restore it. It needs to be used very carefully.
290 */
291static inline unsigned long __get_current_cr3_fast(void)
292{
293 unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
294
295 /* For now, be very restrictive about when this can be called. */
296 VM_WARN_ON(in_nmi() || !in_atomic());
297
298 VM_BUG_ON(cr3 != __read_cr3());
299 return cr3;
300}
301
271#endif /* _ASM_X86_MMU_CONTEXT_H */ 302#endif /* _ASM_X86_MMU_CONTEXT_H */
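With cpu_tlbstate.loaded_mm tracking what is actually in CR3, a caller such as KVM can snapshot the current value without a CR3 read and restore it later. A userspace-shaped sketch of that pattern; the globals below merely stand in for the per-CPU state and the real __pa() translation:

#include <stdint.h>
#include <stdio.h>

struct mm_struct { uint64_t pgd_phys; };

static struct mm_struct init_mm = { .pgd_phys = 0x1000 };
static struct mm_struct *loaded_mm = &init_mm;	/* cpu_tlbstate.loaded_mm */

static uint64_t get_current_cr3_fast(void)
{
	return loaded_mm->pgd_phys;	/* __pa(loaded_mm->pgd) in the kernel */
}

int main(void)
{
	uint64_t saved = get_current_cr3_fast();	/* e.g. around a VM entry */

	/* ... the guest may have run with a different CR3 here ... */
	printf("restore CR3 to %#llx\n", (unsigned long long)saved);
	return 0;
}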
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index a3dcf8944cb9..9ccac1926587 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -61,7 +61,7 @@ static inline void write_cr2(unsigned long x)
61 PVOP_VCALL1(pv_mmu_ops.write_cr2, x); 61 PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
62} 62}
63 63
64static inline unsigned long read_cr3(void) 64static inline unsigned long __read_cr3(void)
65{ 65{
66 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3); 66 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
67} 67}
@@ -312,11 +312,9 @@ static inline void __flush_tlb_single(unsigned long addr)
312} 312}
313 313
314static inline void flush_tlb_others(const struct cpumask *cpumask, 314static inline void flush_tlb_others(const struct cpumask *cpumask,
315 struct mm_struct *mm, 315 const struct flush_tlb_info *info)
316 unsigned long start,
317 unsigned long end)
318{ 316{
319 PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end); 317 PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
320} 318}
321 319
322static inline int paravirt_pgd_alloc(struct mm_struct *mm) 320static inline int paravirt_pgd_alloc(struct mm_struct *mm)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7465d6fe336f..cb976bab6299 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -51,6 +51,7 @@ struct mm_struct;
51struct desc_struct; 51struct desc_struct;
52struct task_struct; 52struct task_struct;
53struct cpumask; 53struct cpumask;
54struct flush_tlb_info;
54 55
55/* 56/*
56 * Wrapper type for pointers to code which uses the non-standard 57 * Wrapper type for pointers to code which uses the non-standard
@@ -223,9 +224,7 @@ struct pv_mmu_ops {
223 void (*flush_tlb_kernel)(void); 224 void (*flush_tlb_kernel)(void);
224 void (*flush_tlb_single)(unsigned long addr); 225 void (*flush_tlb_single)(unsigned long addr);
225 void (*flush_tlb_others)(const struct cpumask *cpus, 226 void (*flush_tlb_others)(const struct cpumask *cpus,
226 struct mm_struct *mm, 227 const struct flush_tlb_info *info);
227 unsigned long start,
228 unsigned long end);
229 228
230 /* Hooks for allocating and freeing a pagetable top-level */ 229 /* Hooks for allocating and freeing a pagetable top-level */
231 int (*pgd_alloc)(struct mm_struct *mm); 230 int (*pgd_alloc)(struct mm_struct *mm);
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 50d35e3185f5..c8821bab938f 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -212,4 +212,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp)
212#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) 212#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
213#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) 213#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
214 214
215#define gup_get_pte gup_get_pte
216/*
217 * WARNING: only to be used in the get_user_pages_fast() implementation.
218 *
219 * With get_user_pages_fast(), we walk down the pagetables without taking
220 * any locks. For this we would like to load the pointers atomically,
221 * but that is not possible (without expensive cmpxchg8b) on PAE. What
222 * we do have is the guarantee that a PTE will only either go from not
223 * present to present, or present to not present or both -- it will not
224 * switch to a completely different present page without a TLB flush in
225 * between; something that we are blocking by holding interrupts off.
226 *
227 * Setting ptes from not present to present goes:
228 *
229 * ptep->pte_high = h;
230 * smp_wmb();
231 * ptep->pte_low = l;
232 *
233 * And present to not present goes:
234 *
235 * ptep->pte_low = 0;
236 * smp_wmb();
237 * ptep->pte_high = 0;
238 *
239 * We must ensure here that the load of pte_low sees 'l' iff pte_high
240 * sees 'h'. We load pte_high *after* loading pte_low, which ensures we
241 * don't see an older value of pte_high. *Then* we recheck pte_low,
242 * which ensures that we haven't picked up a changed pte high. We might
243 * have gotten rubbish values from pte_low and pte_high, but we are
244 * guaranteed that pte_low will not have the present bit set *unless*
245 * it is 'l'. Because get_user_pages_fast() only operates on present ptes
246 * we're safe.
247 */
248static inline pte_t gup_get_pte(pte_t *ptep)
249{
250 pte_t pte;
251
252 do {
253 pte.pte_low = ptep->pte_low;
254 smp_rmb();
255 pte.pte_high = ptep->pte_high;
256 smp_rmb();
257 } while (unlikely(pte.pte_low != ptep->pte_low));
258
259 return pte;
260}
261
215#endif /* _ASM_X86_PGTABLE_3LEVEL_H */ 262#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
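The retry loop above only has to guard against tearing between the two 32-bit halves of a PAE pte, because the writer always publishes pte_low last when making an entry present. A userspace model of the read side (single-threaded, so the loop exits immediately; it only shows the shape of the check):

#include <stdint.h>
#include <stdio.h>

/* A 64-bit pte stored as two 32-bit halves that cannot be loaded atomically. */
struct pae_pte { volatile uint32_t pte_low, pte_high; };

static uint64_t gup_get_pte_model(struct pae_pte *ptep)
{
	uint32_t lo, hi;

	do {
		lo = ptep->pte_low;
		/* smp_rmb() here in the kernel */
		hi = ptep->pte_high;
		/* smp_rmb() here in the kernel */
	} while (lo != ptep->pte_low);	/* reread until pte_low is stable */

	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	struct pae_pte pte = { .pte_low = 0x00000067, .pte_high = 0x12345 };

	printf("pte = %#llx\n", (unsigned long long)gup_get_pte_model(&pte));
	return 0;
}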
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f5af95a0c6b8..77037b6f1caa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -244,6 +244,11 @@ static inline int pud_devmap(pud_t pud)
244 return 0; 244 return 0;
245} 245}
246#endif 246#endif
247
248static inline int pgd_devmap(pgd_t pgd)
249{
250 return 0;
251}
247#endif 252#endif
248#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 253#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
249 254
@@ -917,7 +922,7 @@ extern pgd_t trampoline_pgd_entry;
917static inline void __meminit init_trampoline_default(void) 922static inline void __meminit init_trampoline_default(void)
918{ 923{
919 /* Default trampoline pgd value */ 924 /* Default trampoline pgd value */
920 trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)]; 925 trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
921} 926}
922# ifdef CONFIG_RANDOMIZE_MEMORY 927# ifdef CONFIG_RANDOMIZE_MEMORY
923void __meminit init_trampoline(void); 928void __meminit init_trampoline(void);
@@ -1185,6 +1190,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags)
1185#endif 1190#endif
1186} 1191}
1187 1192
1193static inline bool __pkru_allows_pkey(u16 pkey, bool write)
1194{
1195 u32 pkru = read_pkru();
1196
1197 if (!__pkru_allows_read(pkru, pkey))
1198 return false;
1199 if (write && !__pkru_allows_write(pkru, pkey))
1200 return false;
1201
1202 return true;
1203}
1204
1205/*
1206 * 'pteval' can come from a PTE, PMD or PUD. We only check
1207 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
1208 * same value on all 3 types.
1209 */
1210static inline bool __pte_access_permitted(unsigned long pteval, bool write)
1211{
1212 unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
1213
1214 if (write)
1215 need_pte_bits |= _PAGE_RW;
1216
1217 if ((pteval & need_pte_bits) != need_pte_bits)
1218 return 0;
1219
1220 return __pkru_allows_pkey(pte_flags_pkey(pteval), write);
1221}
1222
1223#define pte_access_permitted pte_access_permitted
1224static inline bool pte_access_permitted(pte_t pte, bool write)
1225{
1226 return __pte_access_permitted(pte_val(pte), write);
1227}
1228
1229#define pmd_access_permitted pmd_access_permitted
1230static inline bool pmd_access_permitted(pmd_t pmd, bool write)
1231{
1232 return __pte_access_permitted(pmd_val(pmd), write);
1233}
1234
1235#define pud_access_permitted pud_access_permitted
1236static inline bool pud_access_permitted(pud_t pud, bool write)
1237{
1238 return __pte_access_permitted(pud_val(pud), write);
1239}
1240
1188#include <asm-generic/pgtable.h> 1241#include <asm-generic/pgtable.h>
1189#endif /* __ASSEMBLY__ */ 1242#endif /* __ASSEMBLY__ */
1190 1243
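pte_access_permitted() is what lets the generic GUP code reject non-present, kernel-only or read-only mappings (and, on x86, pkey-protected ones) without taking page-table locks. A minimal model of the flag test, leaving out the PKRU part since that needs the real register:

#include <stdio.h>
#include <stdbool.h>

#define _PAGE_PRESENT	0x001UL
#define _PAGE_RW	0x002UL
#define _PAGE_USER	0x004UL

/* Model of __pte_access_permitted() minus the protection-key check. */
static bool pte_access_permitted_model(unsigned long pteval, bool write)
{
	unsigned long need = _PAGE_PRESENT | _PAGE_USER;

	if (write)
		need |= _PAGE_RW;
	return (pteval & need) == need;
}

int main(void)
{
	/* Read-only user page: readable but not writable. */
	printf("%d %d\n",
	       pte_access_permitted_model(_PAGE_PRESENT | _PAGE_USER, false),
	       pte_access_permitted_model(_PAGE_PRESENT | _PAGE_USER, true));
	return 0;
}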
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..2160c1fee920 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/threads.h> 15#include <linux/threads.h>
16 16
17extern p4d_t level4_kernel_pgt[512];
18extern p4d_t level4_ident_pgt[512];
17extern pud_t level3_kernel_pgt[512]; 19extern pud_t level3_kernel_pgt[512];
18extern pud_t level3_ident_pgt[512]; 20extern pud_t level3_ident_pgt[512];
19extern pmd_t level2_kernel_pgt[512]; 21extern pmd_t level2_kernel_pgt[512];
20extern pmd_t level2_fixmap_pgt[512]; 22extern pmd_t level2_fixmap_pgt[512];
21extern pmd_t level2_ident_pgt[512]; 23extern pmd_t level2_ident_pgt[512];
22extern pte_t level1_fixmap_pgt[512]; 24extern pte_t level1_fixmap_pgt[512];
23extern pgd_t init_level4_pgt[]; 25extern pgd_t init_top_pgt[];
24 26
25#define swapper_pg_dir init_level4_pgt 27#define swapper_pg_dir init_top_pgt
26 28
27extern void paging_init(void); 29extern void paging_init(void);
28 30
@@ -227,6 +229,20 @@ extern void cleanup_highmap(void);
227extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); 229extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
228extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); 230extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
229 231
230#endif /* !__ASSEMBLY__ */ 232#define gup_fast_permitted gup_fast_permitted
233static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
234 int write)
235{
236 unsigned long len, end;
237
238 len = (unsigned long)nr_pages << PAGE_SHIFT;
239 end = start + len;
240 if (end < start)
241 return false;
242 if (end >> __VIRTUAL_MASK_SHIFT)
243 return false;
244 return true;
245}
231 246
247#endif /* !__ASSEMBLY__ */
232#endif /* _ASM_X86_PGTABLE_64_H */ 248#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 39fb618e2211..79aa2f98398d 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -8,4 +8,40 @@
8#else 8#else
9#define X86_VM_MASK 0 /* No VM86 support */ 9#define X86_VM_MASK 0 /* No VM86 support */
10#endif 10#endif
11
12/*
13 * CR3's layout varies depending on several things.
14 *
15 * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
16 * If PAE is enabled, then CR3[11:5] is part of the PDPT address
17 * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
18 * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
19 * CR3[2:0] and CR3[11:5] are ignored.
20 *
21 * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
22 *
23 * CR3[63] is always read as zero. If CR4.PCIDE is set, then CR3[63] may be
24 * written as 1 to prevent the write to CR3 from flushing the TLB.
25 *
26 * On systems with SME, one bit (in a variable position!) is stolen to indicate
27 * that the top-level paging structure is encrypted.
28 *
29 * All of the remaining bits indicate the physical address of the top-level
30 * paging structure.
31 *
32 * CR3_ADDR_MASK is the mask used by read_cr3_pa().
33 */
34#ifdef CONFIG_X86_64
35/* Mask off the address space ID bits. */
36#define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
37#define CR3_PCID_MASK 0xFFFull
38#else
39/*
40 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
41 * a tiny bit of code size by setting all the bits.
42 */
43#define CR3_ADDR_MASK 0xFFFFFFFFull
44#define CR3_PCID_MASK 0ull
45#endif
46
11#endif /* _ASM_X86_PROCESSOR_FLAGS_H */ 47#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
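Splitting a CR3 value with these masks is what read_cr3_pa() (added in the next hunk) does for the address half; the PCID half only becomes meaningful once CR4.PCIDE is enabled. A small sketch with a made-up CR3 value:

#include <stdint.h>
#include <stdio.h>

#define CR3_ADDR_MASK	0x7FFFFFFFFFFFF000ull
#define CR3_PCID_MASK	0xFFFull

int main(void)
{
	/* Hypothetical CR3 with PCIDE set: the low 12 bits carry the ASID. */
	uint64_t cr3 = 0x000000012345f00aULL;

	printf("pgd phys = %#llx, pcid = %#llx\n",
	       (unsigned long long)(cr3 & CR3_ADDR_MASK),
	       (unsigned long long)(cr3 & CR3_PCID_MASK));
	return 0;
}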
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a28b671f1549..2e1696294af5 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -231,6 +231,14 @@ native_cpuid_reg(ebx)
231native_cpuid_reg(ecx) 231native_cpuid_reg(ecx)
232native_cpuid_reg(edx) 232native_cpuid_reg(edx)
233 233
234/*
235 * Friendlier CR3 helpers.
236 */
237static inline unsigned long read_cr3_pa(void)
238{
239 return __read_cr3() & CR3_ADDR_MASK;
240}
241
234static inline void load_cr3(pgd_t *pgdir) 242static inline void load_cr3(pgd_t *pgdir)
235{ 243{
236 write_cr3(__pa(pgdir)); 244 write_cr3(__pa(pgdir));
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 12af3e35edfa..9efaabf5b54b 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -39,7 +39,7 @@ static inline void native_write_cr2(unsigned long val)
39 asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); 39 asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
40} 40}
41 41
42static inline unsigned long native_read_cr3(void) 42static inline unsigned long __native_read_cr3(void)
43{ 43{
44 unsigned long val; 44 unsigned long val;
45 asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); 45 asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
@@ -159,9 +159,13 @@ static inline void write_cr2(unsigned long x)
159 native_write_cr2(x); 159 native_write_cr2(x);
160} 160}
161 161
162static inline unsigned long read_cr3(void) 162/*
163 * Careful! CR3 contains more than just an address. You probably want
164 * read_cr3_pa() instead.
165 */
166static inline unsigned long __read_cr3(void)
163{ 167{
164 return native_read_cr3(); 168 return __native_read_cr3();
165} 169}
166 170
167static inline void write_cr3(unsigned long x) 171static inline void write_cr3(unsigned long x)
diff --git a/arch/x86/include/asm/tlbbatch.h b/arch/x86/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..f4a6ff352a0e
--- /dev/null
+++ b/arch/x86/include/asm/tlbbatch.h
@@ -0,0 +1,14 @@
1#ifndef _ARCH_X86_TLBBATCH_H
2#define _ARCH_X86_TLBBATCH_H
3
4#include <linux/cpumask.h>
5
6struct arch_tlbflush_unmap_batch {
7 /*
8 * Each bit set is a CPU that potentially has a TLB entry for one of
9 * the PFNs being flushed..
10 */
11 struct cpumask cpumask;
12};
13
14#endif /* _ARCH_X86_TLBBATCH_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6ed9ea469b48..50ea3482e1d1 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,7 @@
7#include <asm/processor.h> 7#include <asm/processor.h>
8#include <asm/cpufeature.h> 8#include <asm/cpufeature.h>
9#include <asm/special_insns.h> 9#include <asm/special_insns.h>
10#include <asm/smp.h>
10 11
11static inline void __invpcid(unsigned long pcid, unsigned long addr, 12static inline void __invpcid(unsigned long pcid, unsigned long addr,
12 unsigned long type) 13 unsigned long type)
@@ -65,10 +66,14 @@ static inline void invpcid_flush_all_nonglobals(void)
65#endif 66#endif
66 67
67struct tlb_state { 68struct tlb_state {
68#ifdef CONFIG_SMP 69 /*
69 struct mm_struct *active_mm; 70 * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
71 * are on. This means that it may not match current->active_mm,
72 * which will contain the previous user mm when we're in lazy TLB
73 * mode even if we've already switched back to swapper_pg_dir.
74 */
75 struct mm_struct *loaded_mm;
70 int state; 76 int state;
71#endif
72 77
73 /* 78 /*
74 * Access to this CR4 shadow and to H/W CR4 is protected by 79 * Access to this CR4 shadow and to H/W CR4 is protected by
@@ -151,7 +156,7 @@ static inline void __native_flush_tlb(void)
151 * back: 156 * back:
152 */ 157 */
153 preempt_disable(); 158 preempt_disable();
154 native_write_cr3(native_read_cr3()); 159 native_write_cr3(__native_read_cr3());
155 preempt_enable(); 160 preempt_enable();
156} 161}
157 162
@@ -220,84 +225,16 @@ static inline void __flush_tlb_one(unsigned long addr)
220 * - flush_tlb_page(vma, vmaddr) flushes one page 225 * - flush_tlb_page(vma, vmaddr) flushes one page
221 * - flush_tlb_range(vma, start, end) flushes a range of pages 226 * - flush_tlb_range(vma, start, end) flushes a range of pages
222 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages 227 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
223 * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus 228 * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
224 * 229 *
225 * ..but the i386 has somewhat limited tlb flushing capabilities, 230 * ..but the i386 has somewhat limited tlb flushing capabilities,
226 * and page-granular flushes are available only on i486 and up. 231 * and page-granular flushes are available only on i486 and up.
227 */ 232 */
228 233struct flush_tlb_info {
229#ifndef CONFIG_SMP 234 struct mm_struct *mm;
230 235 unsigned long start;
231/* "_up" is for UniProcessor. 236 unsigned long end;
232 * 237};
233 * This is a helper for other header functions. *Not* intended to be called
234 * directly. All global TLB flushes need to either call this, or to bump the
235 * vm statistics themselves.
236 */
237static inline void __flush_tlb_up(void)
238{
239 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
240 __flush_tlb();
241}
242
243static inline void flush_tlb_all(void)
244{
245 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
246 __flush_tlb_all();
247}
248
249static inline void local_flush_tlb(void)
250{
251 __flush_tlb_up();
252}
253
254static inline void flush_tlb_mm(struct mm_struct *mm)
255{
256 if (mm == current->active_mm)
257 __flush_tlb_up();
258}
259
260static inline void flush_tlb_page(struct vm_area_struct *vma,
261 unsigned long addr)
262{
263 if (vma->vm_mm == current->active_mm)
264 __flush_tlb_one(addr);
265}
266
267static inline void flush_tlb_range(struct vm_area_struct *vma,
268 unsigned long start, unsigned long end)
269{
270 if (vma->vm_mm == current->active_mm)
271 __flush_tlb_up();
272}
273
274static inline void flush_tlb_mm_range(struct mm_struct *mm,
275 unsigned long start, unsigned long end, unsigned long vmflag)
276{
277 if (mm == current->active_mm)
278 __flush_tlb_up();
279}
280
281static inline void native_flush_tlb_others(const struct cpumask *cpumask,
282 struct mm_struct *mm,
283 unsigned long start,
284 unsigned long end)
285{
286}
287
288static inline void reset_lazy_tlbstate(void)
289{
290}
291
292static inline void flush_tlb_kernel_range(unsigned long start,
293 unsigned long end)
294{
295 flush_tlb_all();
296}
297
298#else /* SMP */
299
300#include <asm/smp.h>
301 238
302#define local_flush_tlb() __flush_tlb() 239#define local_flush_tlb() __flush_tlb()
303 240
@@ -307,29 +244,32 @@ static inline void flush_tlb_kernel_range(unsigned long start,
307 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) 244 flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
308 245
309extern void flush_tlb_all(void); 246extern void flush_tlb_all(void);
310extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
311extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 247extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
312 unsigned long end, unsigned long vmflag); 248 unsigned long end, unsigned long vmflag);
313extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); 249extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
314 250
251static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
252{
253 flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
254}
255
315void native_flush_tlb_others(const struct cpumask *cpumask, 256void native_flush_tlb_others(const struct cpumask *cpumask,
316 struct mm_struct *mm, 257 const struct flush_tlb_info *info);
317 unsigned long start, unsigned long end);
318 258
319#define TLBSTATE_OK 1 259#define TLBSTATE_OK 1
320#define TLBSTATE_LAZY 2 260#define TLBSTATE_LAZY 2
321 261
322static inline void reset_lazy_tlbstate(void) 262static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
263 struct mm_struct *mm)
323{ 264{
324 this_cpu_write(cpu_tlbstate.state, 0); 265 cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
325 this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
326} 266}
327 267
328#endif /* SMP */ 268extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
329 269
330#ifndef CONFIG_PARAVIRT 270#ifndef CONFIG_PARAVIRT
331#define flush_tlb_others(mask, mm, start, end) \ 271#define flush_tlb_others(mask, info) \
332 native_flush_tlb_others(mask, mm, start, end) 272 native_flush_tlb_others(mask, info)
333#endif 273#endif
334 274
335#endif /* _ASM_X86_TLBFLUSH_H */ 275#endif /* _ASM_X86_TLBFLUSH_H */
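Passing one struct flush_tlb_info instead of an (mm, start, end) triple keeps the remote-flush interface stable while the PCID work adds more state to a flush request. A caller-side sketch of the new shape; flush_tlb_others() itself is a kernel interface, so it is stubbed here:

#include <stdio.h>

struct mm_struct;

struct flush_tlb_info {
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;
};

/* Stand-in for native_flush_tlb_others(cpumask, info). */
static void flush_tlb_others_stub(const struct flush_tlb_info *info)
{
	printf("flush %#lx-%#lx\n", info->start, info->end);
}

int main(void)
{
	struct flush_tlb_info info = {
		.mm    = NULL,		/* stand-in for the target mm */
		.start = 0x400000,
		.end   = 0x600000,
	};

	flush_tlb_others_stub(&info);
	return 0;
}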
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index 6686820feae9..b5a32231abd8 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -1,6 +1,8 @@
1#ifndef _ASM_X86_UV_UV_H 1#ifndef _ASM_X86_UV_UV_H
2#define _ASM_X86_UV_UV_H 2#define _ASM_X86_UV_UV_H
3 3
4#include <asm/tlbflush.h>
5
4enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; 6enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
5 7
6struct cpumask; 8struct cpumask;
@@ -15,10 +17,7 @@ extern void uv_cpu_init(void);
15extern void uv_nmi_init(void); 17extern void uv_nmi_init(void);
16extern void uv_system_init(void); 18extern void uv_system_init(void);
17extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 19extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
18 struct mm_struct *mm, 20 const struct flush_tlb_info *info);
19 unsigned long start,
20 unsigned long end,
21 unsigned int cpu);
22 21
23#else /* X86_UV */ 22#else /* X86_UV */
24 23
@@ -28,8 +27,8 @@ static inline int is_uv_hubless(void) { return 0; }
28static inline void uv_cpu_init(void) { } 27static inline void uv_cpu_init(void) { }
29static inline void uv_system_init(void) { } 28static inline void uv_system_init(void) { }
30static inline const struct cpumask * 29static inline const struct cpumask *
31uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, 30uv_flush_tlb_others(const struct cpumask *cpumask,
32 unsigned long start, unsigned long end, unsigned int cpu) 31 const struct flush_tlb_info *info)
33{ return cpumask; } 32{ return cpumask; }
34 33
35#endif /* X86_UV */ 34#endif /* X86_UV */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..185f3d10c194 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@
104#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) 104#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
105#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */ 105#define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
106#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) 106#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
107#define X86_CR4_LA57_BIT 12 /* enable 5-level page tables */
108#define X86_CR4_LA57 _BITUL(X86_CR4_LA57_BIT)
107#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */ 109#define X86_CR4_VMXE_BIT 13 /* enable VMX virtualization */
108#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT) 110#define X86_CR4_VMXE _BITUL(X86_CR4_VMXE_BIT)
109#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */ 111#define X86_CR4_SMXE_BIT 14 /* enable safer mode (TXT) */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3c7c419c4e3e..a01892bdd61a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -18,6 +18,7 @@ CFLAGS_REMOVE_pvclock.o = -pg
18CFLAGS_REMOVE_kvmclock.o = -pg 18CFLAGS_REMOVE_kvmclock.o = -pg
19CFLAGS_REMOVE_ftrace.o = -pg 19CFLAGS_REMOVE_ftrace.o = -pg
20CFLAGS_REMOVE_early_printk.o = -pg 20CFLAGS_REMOVE_early_printk.o = -pg
21CFLAGS_REMOVE_head64.o = -pg
21endif 22endif
22 23
23KASAN_SANITIZE_head$(BITS).o := n 24KASAN_SANITIZE_head$(BITS).o := n
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
125 p4d_t *p4d; 125 p4d_t *p4d;
126 126
127 /* Install the espfix pud into the kernel page directory */ 127 /* Install the espfix pud into the kernel page directory */
128 pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; 128 pgd = &init_top_pgt[pgd_index(ESPFIX_BASE_ADDR)];
129 p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); 129 p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
130 p4d_populate(&init_mm, p4d, espfix_pud_page); 130 p4d_populate(&init_mm, p4d, espfix_pud_page);
131 131
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 43b7002f44fb..46c3c73e7f43 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -33,17 +33,120 @@
33/* 33/*
34 * Manage page tables very early on. 34 * Manage page tables very early on.
35 */ 35 */
36extern pgd_t early_level4_pgt[PTRS_PER_PGD]; 36extern pgd_t early_top_pgt[PTRS_PER_PGD];
37extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; 37extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
38static unsigned int __initdata next_early_pgt = 2; 38static unsigned int __initdata next_early_pgt;
39pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); 39pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX);
40 40
41#define __head __section(.head.text)
42
43static void __head *fixup_pointer(void *ptr, unsigned long physaddr)
44{
45 return ptr - (void *)_text + (void *)physaddr;
46}
47
48void __head __startup_64(unsigned long physaddr)
49{
50 unsigned long load_delta, *p;
51 pgdval_t *pgd;
52 p4dval_t *p4d;
53 pudval_t *pud;
54 pmdval_t *pmd, pmd_entry;
55 int i;
56
57 /* Is the address too large? */
58 if (physaddr >> MAX_PHYSMEM_BITS)
59 for (;;);
60
61 /*
62 * Compute the delta between the address I am compiled to run at
63 * and the address I am actually running at.
64 */
65 load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
66
67 /* Is the address not 2M aligned? */
68 if (load_delta & ~PMD_PAGE_MASK)
69 for (;;);
70
71 /* Fixup the physical addresses in the page table */
72
73 pgd = fixup_pointer(&early_top_pgt, physaddr);
74 pgd[pgd_index(__START_KERNEL_map)] += load_delta;
75
76 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
77 p4d = fixup_pointer(&level4_kernel_pgt, physaddr);
78 p4d[511] += load_delta;
79 }
80
81 pud = fixup_pointer(&level3_kernel_pgt, physaddr);
82 pud[510] += load_delta;
83 pud[511] += load_delta;
84
85 pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
86 pmd[506] += load_delta;
87
88 /*
89 * Set up the identity mapping for the switchover. These
90 * entries should *NOT* have the global bit set! This also
91 * creates a bunch of nonsense entries but that is fine --
92 * it avoids problems around wraparound.
93 */
94
95 pud = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
96 pmd = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
97
98 if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
99 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr);
100
101 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
102 pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE;
103 pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE;
104
105 i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D;
106 p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
107 p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
108 } else {
109 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD;
110 pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE;
111 pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE;
112 }
113
114 i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD;
115 pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE;
116 pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE;
117
118 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL;
119 pmd_entry += physaddr;
120
121 for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
122 int idx = i + (physaddr >> PMD_SHIFT) % PTRS_PER_PMD;
123 pmd[idx] = pmd_entry + i * PMD_SIZE;
124 }
125
126 /*
127 * Fixup the kernel text+data virtual addresses. Note that
128 * we might write invalid pmds, when the kernel is relocated
129 * cleanup_highmap() fixes this up along with the mappings
130 * beyond _end.
131 */
132
133 pmd = fixup_pointer(level2_kernel_pgt, physaddr);
134 for (i = 0; i < PTRS_PER_PMD; i++) {
135 if (pmd[i] & _PAGE_PRESENT)
136 pmd[i] += load_delta;
137 }
138
139 /* Fixup phys_base */
140 p = fixup_pointer(&phys_base, physaddr);
141 *p += load_delta;
142}
143
41/* Wipe all early page tables except for the kernel symbol map */ 144/* Wipe all early page tables except for the kernel symbol map */
42static void __init reset_early_page_tables(void) 145static void __init reset_early_page_tables(void)
43{ 146{
44 memset(early_level4_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); 147 memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1));
45 next_early_pgt = 0; 148 next_early_pgt = 0;
46 write_cr3(__pa_nodebug(early_level4_pgt)); 149 write_cr3(__pa_nodebug(early_top_pgt));
47} 150}
48 151
49/* Create a new PMD entry */ 152/* Create a new PMD entry */
@@ -51,15 +154,16 @@ int __init early_make_pgtable(unsigned long address)
51{ 154{
52 unsigned long physaddr = address - __PAGE_OFFSET; 155 unsigned long physaddr = address - __PAGE_OFFSET;
53 pgdval_t pgd, *pgd_p; 156 pgdval_t pgd, *pgd_p;
157 p4dval_t p4d, *p4d_p;
54 pudval_t pud, *pud_p; 158 pudval_t pud, *pud_p;
55 pmdval_t pmd, *pmd_p; 159 pmdval_t pmd, *pmd_p;
56 160
57 /* Invalid address or early pgt is done ? */ 161 /* Invalid address or early pgt is done ? */
58 if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt)) 162 if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt))
59 return -1; 163 return -1;
60 164
61again: 165again:
62 pgd_p = &early_level4_pgt[pgd_index(address)].pgd; 166 pgd_p = &early_top_pgt[pgd_index(address)].pgd;
63 pgd = *pgd_p; 167 pgd = *pgd_p;
64 168
65 /* 169 /*
@@ -67,8 +171,25 @@ again:
67 * critical -- __PAGE_OFFSET would point us back into the dynamic 171 * critical -- __PAGE_OFFSET would point us back into the dynamic
68 * range and we might end up looping forever... 172 * range and we might end up looping forever...
69 */ 173 */
70 if (pgd) 174 if (!IS_ENABLED(CONFIG_X86_5LEVEL))
71 pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base); 175 p4d_p = pgd_p;
176 else if (pgd)
177 p4d_p = (p4dval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
178 else {
179 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
180 reset_early_page_tables();
181 goto again;
182 }
183
184 p4d_p = (p4dval_t *)early_dynamic_pgts[next_early_pgt++];
185 memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D);
186 *pgd_p = (pgdval_t)p4d_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
187 }
188 p4d_p += p4d_index(address);
189 p4d = *p4d_p;
190
191 if (p4d)
192 pud_p = (pudval_t *)((p4d & PTE_PFN_MASK) + __START_KERNEL_map - phys_base);
72 else { 193 else {
73 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) { 194 if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES) {
74 reset_early_page_tables(); 195 reset_early_page_tables();
@@ -77,7 +198,7 @@ again:
77 198
78 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++]; 199 pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
79 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 200 memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD);
80 *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 201 *p4d_p = (p4dval_t)pud_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE;
81 } 202 }
82 pud_p += pud_index(address); 203 pud_p += pud_index(address);
83 pud = *pud_p; 204 pud = *pud_p;
@@ -156,7 +277,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
156 277
157 clear_bss(); 278 clear_bss();
158 279
159 clear_page(init_level4_pgt); 280 clear_page(init_top_pgt);
160 281
161 kasan_early_init(); 282 kasan_early_init();
162 283
@@ -171,8 +292,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
171 */ 292 */
172 load_ucode_bsp(); 293 load_ucode_bsp();
173 294
174 /* set init_level4_pgt kernel high mapping*/ 295 /* set init_top_pgt kernel high mapping*/
175 init_level4_pgt[511] = early_level4_pgt[511]; 296 init_top_pgt[511] = early_top_pgt[511];
176 297
177 x86_64_start_reservations(real_mode_data); 298 x86_64_start_reservations(real_mode_data);
178} 299}
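__startup_64() runs from the identity mapping before the final kernel mapping exists, so every symbol it touches has to be converted from its link-time virtual address to the physical address the kernel was actually loaded at; that is all fixup_pointer() does. A sketch of the arithmetic with made-up addresses:

#include <stdint.h>
#include <stdio.h>

/* Model of fixup_pointer(): ptr - _text + physaddr. */
static uint64_t fixup_pointer_model(uint64_t ptr, uint64_t text_va,
				    uint64_t physaddr)
{
	return ptr - text_va + physaddr;
}

int main(void)
{
	uint64_t text_va  = 0xffffffff81000000ULL;	/* where _text is linked   */
	uint64_t physaddr = 0x0000000001000000ULL;	/* where we really booted  */
	uint64_t sym      = 0xffffffff81234000ULL;	/* some link-time address  */

	printf("phys = %#llx\n",
	       (unsigned long long)fixup_pointer_model(sym, text_va, physaddr));
	return 0;
}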
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ac9d327d2e42..6225550883df 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -37,10 +37,11 @@
37 * 37 *
38 */ 38 */
39 39
40#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
40#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) 41#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
41 42
42L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) 43PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
43L4_START_KERNEL = pgd_index(__START_KERNEL_map) 44PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
44L3_START_KERNEL = pud_index(__START_KERNEL_map) 45L3_START_KERNEL = pud_index(__START_KERNEL_map)
45 46
46 .text 47 .text
@@ -72,101 +73,12 @@ startup_64:
72 /* Sanitize CPU configuration */ 73 /* Sanitize CPU configuration */
73 call verify_cpu 74 call verify_cpu
74 75
75 /*
76 * Compute the delta between the address I am compiled to run at and the
77 * address I am actually running at.
78 */
79 leaq _text(%rip), %rbp
80 subq $_text - __START_KERNEL_map, %rbp
81
82 /* Is the address not 2M aligned? */
83 testl $~PMD_PAGE_MASK, %ebp
84 jnz bad_address
85
86 /*
87 * Is the address too large?
88 */
89 leaq _text(%rip), %rax
90 shrq $MAX_PHYSMEM_BITS, %rax
91 jnz bad_address
92
93 /*
94 * Fixup the physical addresses in the page table
95 */
96 addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
97
98 addq %rbp, level3_kernel_pgt + (510*8)(%rip)
99 addq %rbp, level3_kernel_pgt + (511*8)(%rip)
100
101 addq %rbp, level2_fixmap_pgt + (506*8)(%rip)
102
103 /*
104 * Set up the identity mapping for the switchover. These
105 * entries should *NOT* have the global bit set! This also
106 * creates a bunch of nonsense entries but that is fine --
107 * it avoids problems around wraparound.
108 */
109 leaq _text(%rip), %rdi 76 leaq _text(%rip), %rdi
110 leaq early_level4_pgt(%rip), %rbx 77 pushq %rsi
111 78 call __startup_64
112 movq %rdi, %rax 79 popq %rsi
113 shrq $PGDIR_SHIFT, %rax
114
115 leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx
116 movq %rdx, 0(%rbx,%rax,8)
117 movq %rdx, 8(%rbx,%rax,8)
118
119 addq $PAGE_SIZE, %rdx
120 movq %rdi, %rax
121 shrq $PUD_SHIFT, %rax
122 andl $(PTRS_PER_PUD-1), %eax
123 movq %rdx, PAGE_SIZE(%rbx,%rax,8)
124 incl %eax
125 andl $(PTRS_PER_PUD-1), %eax
126 movq %rdx, PAGE_SIZE(%rbx,%rax,8)
127
128 addq $PAGE_SIZE * 2, %rbx
129 movq %rdi, %rax
130 shrq $PMD_SHIFT, %rdi
131 addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
132 leaq (_end - 1)(%rip), %rcx
133 shrq $PMD_SHIFT, %rcx
134 subq %rdi, %rcx
135 incl %ecx
136
1371:
138 andq $(PTRS_PER_PMD - 1), %rdi
139 movq %rax, (%rbx,%rdi,8)
140 incq %rdi
141 addq $PMD_SIZE, %rax
142 decl %ecx
143 jnz 1b
144
145 test %rbp, %rbp
146 jz .Lskip_fixup
147 80
148 /* 81 movq $(early_top_pgt - __START_KERNEL_map), %rax
149 * Fixup the kernel text+data virtual addresses. Note that
150 * we might write invalid pmds, when the kernel is relocated
151 * cleanup_highmap() fixes this up along with the mappings
152 * beyond _end.
153 */
154 leaq level2_kernel_pgt(%rip), %rdi
155 leaq PAGE_SIZE(%rdi), %r8
156 /* See if it is a valid page table entry */
1571: testb $_PAGE_PRESENT, 0(%rdi)
158 jz 2f
159 addq %rbp, 0(%rdi)
160 /* Go to the next page */
1612: addq $8, %rdi
162 cmp %r8, %rdi
163 jne 1b
164
165 /* Fixup phys_base */
166 addq %rbp, phys_base(%rip)
167
168.Lskip_fixup:
169 movq $(early_level4_pgt - __START_KERNEL_map), %rax
170 jmp 1f 82 jmp 1f
171ENTRY(secondary_startup_64) 83ENTRY(secondary_startup_64)
172 /* 84 /*
@@ -186,14 +98,17 @@ ENTRY(secondary_startup_64)
186 /* Sanitize CPU configuration */ 98 /* Sanitize CPU configuration */
187 call verify_cpu 99 call verify_cpu
188 100
189 movq $(init_level4_pgt - __START_KERNEL_map), %rax 101 movq $(init_top_pgt - __START_KERNEL_map), %rax
1901: 1021:
191 103
192 /* Enable PAE mode and PGE */ 104 /* Enable PAE mode, PGE and LA57 */
193 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx 105 movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
106#ifdef CONFIG_X86_5LEVEL
107 orl $X86_CR4_LA57, %ecx
108#endif
194 movq %rcx, %cr4 109 movq %rcx, %cr4
195 110
196 /* Setup early boot stage 4 level pagetables. */ 111 /* Setup early boot stage 4-/5-level pagetables. */
197 addq phys_base(%rip), %rax 112 addq phys_base(%rip), %rax
198 movq %rax, %cr3 113 movq %rax, %cr3
199 114
@@ -417,9 +332,13 @@ GLOBAL(name)
417 .endr 332 .endr
418 333
419 __INITDATA 334 __INITDATA
420NEXT_PAGE(early_level4_pgt) 335NEXT_PAGE(early_top_pgt)
421 .fill 511,8,0 336 .fill 511,8,0
337#ifdef CONFIG_X86_5LEVEL
338 .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
339#else
422 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 340 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
341#endif
423 342
424NEXT_PAGE(early_dynamic_pgts) 343NEXT_PAGE(early_dynamic_pgts)
425 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 344 .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -427,14 +346,14 @@ NEXT_PAGE(early_dynamic_pgts)
427 .data 346 .data
428 347
429#ifndef CONFIG_XEN 348#ifndef CONFIG_XEN
430NEXT_PAGE(init_level4_pgt) 349NEXT_PAGE(init_top_pgt)
431 .fill 512,8,0 350 .fill 512,8,0
432#else 351#else
433NEXT_PAGE(init_level4_pgt) 352NEXT_PAGE(init_top_pgt)
434 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 353 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
435 .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 354 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0
436 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 355 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
437 .org init_level4_pgt + L4_START_KERNEL*8, 0 356 .org init_top_pgt + PGD_START_KERNEL*8, 0
438 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 357 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
439 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 358 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
440 359
@@ -448,6 +367,12 @@ NEXT_PAGE(level2_ident_pgt)
448 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) 367 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
449#endif 368#endif
450 369
370#ifdef CONFIG_X86_5LEVEL
371NEXT_PAGE(level4_kernel_pgt)
372 .fill 511,8,0
373 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
374#endif
375
451NEXT_PAGE(level3_kernel_pgt) 376NEXT_PAGE(level3_kernel_pgt)
452 .fill L3_START_KERNEL,8,0 377 .fill L3_START_KERNEL,8,0
453 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 378 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
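The renamed PGD_PAGE_OFFSET/PGD_START_KERNEL symbols and the .org directives above rely on the two index values quoted in the comments (511 and 510). A tiny stand-alone check of that arithmetic, assuming __START_KERNEL_map is the usual -2 GiB kernel text base, 0xffffffff80000000; the names below are illustrative only:

#include <stdio.h>
#include <stdint.h>

#define START_KERNEL_MAP 0xffffffff80000000ULL  /* assumed __START_KERNEL_map */

int main(void)
{
        /* pgd_index(): bits 39-47 (matching the /(2^39) comment); pud_index(): bits 30-38 */
        unsigned int pgd_start_kernel = (unsigned int)((START_KERNEL_MAP >> 39) & 511);
        unsigned int l3_start_kernel  = (unsigned int)((START_KERNEL_MAP >> 30) & 511);

        printf("PGD_START_KERNEL = %u\n", pgd_start_kernel);    /* prints 511 */
        printf("L3_START_KERNEL  = %u\n", l3_start_kernel);     /* prints 510 */
        return 0;
}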
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d4a15831ac58..a870910c8565 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -22,24 +22,25 @@
22#include <asm/syscalls.h> 22#include <asm/syscalls.h>
23 23
24/* context.lock is held for us, so we don't need any locking. */ 24/* context.lock is held for us, so we don't need any locking. */
25static void flush_ldt(void *current_mm) 25static void flush_ldt(void *__mm)
26{ 26{
27 struct mm_struct *mm = __mm;
27 mm_context_t *pc; 28 mm_context_t *pc;
28 29
29 if (current->active_mm != current_mm) 30 if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
30 return; 31 return;
31 32
32 pc = &current->active_mm->context; 33 pc = &mm->context;
33 set_ldt(pc->ldt->entries, pc->ldt->size); 34 set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
34} 35}
35 36
36/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ 37/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
37static struct ldt_struct *alloc_ldt_struct(unsigned int size) 38static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
38{ 39{
39 struct ldt_struct *new_ldt; 40 struct ldt_struct *new_ldt;
40 unsigned int alloc_size; 41 unsigned int alloc_size;
41 42
42 if (size > LDT_ENTRIES) 43 if (num_entries > LDT_ENTRIES)
43 return NULL; 44 return NULL;
44 45
45 new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL); 46 new_ldt = kmalloc(sizeof(struct ldt_struct), GFP_KERNEL);
@@ -47,7 +48,7 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
47 return NULL; 48 return NULL;
48 49
49 BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct)); 50 BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
50 alloc_size = size * LDT_ENTRY_SIZE; 51 alloc_size = num_entries * LDT_ENTRY_SIZE;
51 52
52 /* 53 /*
53 * Xen is very picky: it requires a page-aligned LDT that has no 54 * Xen is very picky: it requires a page-aligned LDT that has no
@@ -65,14 +66,14 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int size)
65 return NULL; 66 return NULL;
66 } 67 }
67 68
68 new_ldt->size = size; 69 new_ldt->nr_entries = num_entries;
69 return new_ldt; 70 return new_ldt;
70} 71}
71 72
72/* After calling this, the LDT is immutable. */ 73/* After calling this, the LDT is immutable. */
73static void finalize_ldt_struct(struct ldt_struct *ldt) 74static void finalize_ldt_struct(struct ldt_struct *ldt)
74{ 75{
75 paravirt_alloc_ldt(ldt->entries, ldt->size); 76 paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);
76} 77}
77 78
78/* context.lock is held */ 79/* context.lock is held */
@@ -91,8 +92,8 @@ static void free_ldt_struct(struct ldt_struct *ldt)
91 if (likely(!ldt)) 92 if (likely(!ldt))
92 return; 93 return;
93 94
94 paravirt_free_ldt(ldt->entries, ldt->size); 95 paravirt_free_ldt(ldt->entries, ldt->nr_entries);
95 if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) 96 if (ldt->nr_entries * LDT_ENTRY_SIZE > PAGE_SIZE)
96 vfree_atomic(ldt->entries); 97 vfree_atomic(ldt->entries);
97 else 98 else
98 free_page((unsigned long)ldt->entries); 99 free_page((unsigned long)ldt->entries);
@@ -122,14 +123,14 @@ int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm)
122 goto out_unlock; 123 goto out_unlock;
123 } 124 }
124 125
125 new_ldt = alloc_ldt_struct(old_mm->context.ldt->size); 126 new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);
126 if (!new_ldt) { 127 if (!new_ldt) {
127 retval = -ENOMEM; 128 retval = -ENOMEM;
128 goto out_unlock; 129 goto out_unlock;
129 } 130 }
130 131
131 memcpy(new_ldt->entries, old_mm->context.ldt->entries, 132 memcpy(new_ldt->entries, old_mm->context.ldt->entries,
132 new_ldt->size * LDT_ENTRY_SIZE); 133 new_ldt->nr_entries * LDT_ENTRY_SIZE);
133 finalize_ldt_struct(new_ldt); 134 finalize_ldt_struct(new_ldt);
134 135
135 mm->context.ldt = new_ldt; 136 mm->context.ldt = new_ldt;
@@ -152,9 +153,9 @@ void destroy_context_ldt(struct mm_struct *mm)
152 153
153static int read_ldt(void __user *ptr, unsigned long bytecount) 154static int read_ldt(void __user *ptr, unsigned long bytecount)
154{ 155{
155 int retval;
156 unsigned long size;
157 struct mm_struct *mm = current->mm; 156 struct mm_struct *mm = current->mm;
157 unsigned long entries_size;
158 int retval;
158 159
159 mutex_lock(&mm->context.lock); 160 mutex_lock(&mm->context.lock);
160 161
@@ -166,18 +167,18 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)
166 if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) 167 if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
167 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; 168 bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
168 169
169 size = mm->context.ldt->size * LDT_ENTRY_SIZE; 170 entries_size = mm->context.ldt->nr_entries * LDT_ENTRY_SIZE;
170 if (size > bytecount) 171 if (entries_size > bytecount)
171 size = bytecount; 172 entries_size = bytecount;
172 173
173 if (copy_to_user(ptr, mm->context.ldt->entries, size)) { 174 if (copy_to_user(ptr, mm->context.ldt->entries, entries_size)) {
174 retval = -EFAULT; 175 retval = -EFAULT;
175 goto out_unlock; 176 goto out_unlock;
176 } 177 }
177 178
178 if (size != bytecount) { 179 if (entries_size != bytecount) {
179 /* Zero-fill the rest and pretend we read bytecount bytes. */ 180 /* Zero-fill the rest and pretend we read bytecount bytes. */
180 if (clear_user(ptr + size, bytecount - size)) { 181 if (clear_user(ptr + entries_size, bytecount - entries_size)) {
181 retval = -EFAULT; 182 retval = -EFAULT;
182 goto out_unlock; 183 goto out_unlock;
183 } 184 }
@@ -208,7 +209,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
208{ 209{
209 struct mm_struct *mm = current->mm; 210 struct mm_struct *mm = current->mm;
210 struct ldt_struct *new_ldt, *old_ldt; 211 struct ldt_struct *new_ldt, *old_ldt;
211 unsigned int oldsize, newsize; 212 unsigned int old_nr_entries, new_nr_entries;
212 struct user_desc ldt_info; 213 struct user_desc ldt_info;
213 struct desc_struct ldt; 214 struct desc_struct ldt;
214 int error; 215 int error;
@@ -247,17 +248,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
247 248
248 mutex_lock(&mm->context.lock); 249 mutex_lock(&mm->context.lock);
249 250
250 old_ldt = mm->context.ldt; 251 old_ldt = mm->context.ldt;
251 oldsize = old_ldt ? old_ldt->size : 0; 252 old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
252 newsize = max(ldt_info.entry_number + 1, oldsize); 253 new_nr_entries = max(ldt_info.entry_number + 1, old_nr_entries);
253 254
254 error = -ENOMEM; 255 error = -ENOMEM;
255 new_ldt = alloc_ldt_struct(newsize); 256 new_ldt = alloc_ldt_struct(new_nr_entries);
256 if (!new_ldt) 257 if (!new_ldt)
257 goto out_unlock; 258 goto out_unlock;
258 259
259 if (old_ldt) 260 if (old_ldt)
260 memcpy(new_ldt->entries, old_ldt->entries, oldsize * LDT_ENTRY_SIZE); 261 memcpy(new_ldt->entries, old_ldt->entries, old_nr_entries * LDT_ENTRY_SIZE);
262
261 new_ldt->entries[ldt_info.entry_number] = ldt; 263 new_ldt->entries[ldt_info.entry_number] = ldt;
262 finalize_ldt_struct(new_ldt); 264 finalize_ldt_struct(new_ldt);
263 265
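The size -> nr_entries rename separates the entry count from the byte count: an LDT allocation is nr_entries * LDT_ENTRY_SIZE bytes, and anything larger than one page has to come from vmalloc (and is vfree'd on the matching path above). A minimal sketch of just that sizing decision, with LDT_ENTRY_SIZE and PAGE_SIZE hard-coded to their usual x86 values and a made-up helper name:

#include <stdio.h>

#define LDT_ENTRY_SIZE 8        /* sizeof(struct desc_struct) on x86 */
#define PAGE_SIZE      4096

/* Returns 1 if an LDT with this many entries needs a vmalloc'ed buffer. */
static int ldt_needs_vmalloc(unsigned int nr_entries)
{
        unsigned int alloc_size = nr_entries * LDT_ENTRY_SIZE;

        return alloc_size > PAGE_SIZE;
}

int main(void)
{
        printf("512 entries -> %s\n", ldt_needs_vmalloc(512) ? "vmalloc" : "one page");
        printf("513 entries -> %s\n", ldt_needs_vmalloc(513) ? "vmalloc" : "one page");
        return 0;
}

With 8-byte descriptors, 512 entries exactly fill one page, so only LDTs of 513 entries or more take the vmalloc path.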
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6f5ca4ebe6e5..cb0a30473c23 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -347,7 +347,7 @@ void machine_kexec(struct kimage *image)
347void arch_crash_save_vmcoreinfo(void) 347void arch_crash_save_vmcoreinfo(void)
348{ 348{
349 VMCOREINFO_NUMBER(phys_base); 349 VMCOREINFO_NUMBER(phys_base);
350 VMCOREINFO_SYMBOL(init_level4_pgt); 350 VMCOREINFO_SYMBOL(init_top_pgt);
351 351
352#ifdef CONFIG_NUMA 352#ifdef CONFIG_NUMA
353 VMCOREINFO_SYMBOL(node_data); 353 VMCOREINFO_SYMBOL(node_data);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 3586996fc50d..bc0a849589bb 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -391,7 +391,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
391 391
392 .read_cr2 = native_read_cr2, 392 .read_cr2 = native_read_cr2,
393 .write_cr2 = native_write_cr2, 393 .write_cr2 = native_write_cr2,
394 .read_cr3 = native_read_cr3, 394 .read_cr3 = __native_read_cr3,
395 .write_cr3 = native_write_cr3, 395 .write_cr3 = native_write_cr3,
396 396
397 .flush_tlb_user = native_flush_tlb, 397 .flush_tlb_user = native_flush_tlb,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index ffeae818aa7a..c6d6dc5f8bb2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -92,7 +92,7 @@ void __show_regs(struct pt_regs *regs, int all)
92 92
93 cr0 = read_cr0(); 93 cr0 = read_cr0();
94 cr2 = read_cr2(); 94 cr2 = read_cr2();
95 cr3 = read_cr3(); 95 cr3 = __read_cr3();
96 cr4 = __read_cr4(); 96 cr4 = __read_cr4();
97 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 97 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
98 cr0, cr2, cr3, cr4); 98 cr0, cr2, cr3, cr4);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b6840bf3940b..c3169be4c596 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -104,7 +104,7 @@ void __show_regs(struct pt_regs *regs, int all)
104 104
105 cr0 = read_cr0(); 105 cr0 = read_cr0();
106 cr2 = read_cr2(); 106 cr2 = read_cr2();
107 cr3 = read_cr3(); 107 cr3 = __read_cr3();
108 cr4 = __read_cr4(); 108 cr4 = __read_cr4();
109 109
110 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 110 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
@@ -142,7 +142,7 @@ void release_thread(struct task_struct *dead_task)
142 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", 142 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
143 dead_task->comm, 143 dead_task->comm,
144 dead_task->mm->context.ldt->entries, 144 dead_task->mm->context.ldt->entries,
145 dead_task->mm->context.ldt->size); 145 dead_task->mm->context.ldt->nr_entries);
146 BUG(); 146 BUG();
147 } 147 }
148#endif 148#endif
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 045e4f993bd2..b474c8de7fba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1589,7 +1589,6 @@ void native_cpu_die(unsigned int cpu)
1589void play_dead_common(void) 1589void play_dead_common(void)
1590{ 1590{
1591 idle_task_exit(); 1591 idle_task_exit();
1592 reset_lazy_tlbstate();
1593 1592
1594 /* Ack it */ 1593 /* Ack it */
1595 (void)cpu_report_death(); 1594 (void)cpu_report_death();
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index f07f83b3611b..5f25cfbd952e 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -34,7 +34,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
34 34
35 mutex_lock(&child->mm->context.lock); 35 mutex_lock(&child->mm->context.lock);
36 if (unlikely(!child->mm->context.ldt || 36 if (unlikely(!child->mm->context.ldt ||
37 seg >= child->mm->context.ldt->size)) 37 seg >= child->mm->context.ldt->nr_entries))
38 addr = -1L; /* bogus selector, access would fault */ 38 addr = -1L; /* bogus selector, access would fault */
39 else { 39 else {
40 desc = &child->mm->context.ldt->entries[seg]; 40 desc = &child->mm->context.ldt->entries[seg];
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1b469b6c762f..6dcc4873e435 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,6 +49,7 @@
49#include <asm/kexec.h> 49#include <asm/kexec.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/irq_remapping.h> 51#include <asm/irq_remapping.h>
52#include <asm/mmu_context.h>
52 53
53#include "trace.h" 54#include "trace.h"
54#include "pmu.h" 55#include "pmu.h"
@@ -597,6 +598,7 @@ struct vcpu_vmx {
597 int gs_ldt_reload_needed; 598 int gs_ldt_reload_needed;
598 int fs_reload_needed; 599 int fs_reload_needed;
599 u64 msr_host_bndcfgs; 600 u64 msr_host_bndcfgs;
601 unsigned long vmcs_host_cr3; /* May not match real cr3 */
600 unsigned long vmcs_host_cr4; /* May not match real cr4 */ 602 unsigned long vmcs_host_cr4; /* May not match real cr4 */
601 } host_state; 603 } host_state;
602 struct { 604 struct {
@@ -5013,12 +5015,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
5013 u32 low32, high32; 5015 u32 low32, high32;
5014 unsigned long tmpl; 5016 unsigned long tmpl;
5015 struct desc_ptr dt; 5017 struct desc_ptr dt;
5016 unsigned long cr0, cr4; 5018 unsigned long cr0, cr3, cr4;
5017 5019
5018 cr0 = read_cr0(); 5020 cr0 = read_cr0();
5019 WARN_ON(cr0 & X86_CR0_TS); 5021 WARN_ON(cr0 & X86_CR0_TS);
5020 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ 5022 vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
5021 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 5023
5024 /*
5025 * Save the most likely value for this task's CR3 in the VMCS.
5026 * We can't use __get_current_cr3_fast() because we're not atomic.
5027 */
5028 cr3 = __read_cr3();
5029 vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
5030 vmx->host_state.vmcs_host_cr3 = cr3;
5022 5031
5023 /* Save the most likely value for this task's CR4 in the VMCS. */ 5032 /* Save the most likely value for this task's CR4 in the VMCS. */
5024 cr4 = cr4_read_shadow(); 5033 cr4 = cr4_read_shadow();
@@ -8822,7 +8831,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
8822static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 8831static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
8823{ 8832{
8824 struct vcpu_vmx *vmx = to_vmx(vcpu); 8833 struct vcpu_vmx *vmx = to_vmx(vcpu);
8825 unsigned long debugctlmsr, cr4; 8834 unsigned long debugctlmsr, cr3, cr4;
8826 8835
8827 /* Don't enter VMX if guest state is invalid, let the exit handler 8836 /* Don't enter VMX if guest state is invalid, let the exit handler
8828 start emulation until we arrive back to a valid state */ 8837 start emulation until we arrive back to a valid state */
@@ -8844,6 +8853,12 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
8844 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 8853 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
8845 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 8854 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
8846 8855
8856 cr3 = __get_current_cr3_fast();
8857 if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
8858 vmcs_writel(HOST_CR3, cr3);
8859 vmx->host_state.vmcs_host_cr3 = cr3;
8860 }
8861
8847 cr4 = cr4_read_shadow(); 8862 cr4 = cr4_read_shadow();
8848 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { 8863 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
8849 vmcs_writel(HOST_CR4, cr4); 8864 vmcs_writel(HOST_CR4, cr4);
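The two vmx.c hunks apply to CR3 the trick already used for CR4: remember the value last written into the VMCS host-state field and only issue another VMCS write when the current value differs. A self-contained sketch of that cache-and-compare idiom, where write_field() is a hypothetical stand-in for vmcs_writel(HOST_CR3, ...):

#include <stdio.h>

static unsigned long cached_host_cr3;  /* last value written to HOST_CR3 */

/* Hypothetical stand-in for vmcs_writel(HOST_CR3, val). */
static void write_field(unsigned long val)
{
        printf("vmwrite HOST_CR3 = %#lx\n", val);
}

/* Called with the current CR3 before every VM entry; writes only on change. */
static void sync_host_cr3(unsigned long cr3)
{
        if (cr3 != cached_host_cr3) {
                write_field(cr3);
                cached_host_cr3 = cr3;
        }
}

int main(void)
{
        sync_host_cr3(0x1000);  /* first entry: writes the field   */
        sync_host_cr3(0x1000);  /* same CR3: no VMCS write at all  */
        sync_host_cr3(0x2000);  /* CR3 changed: writes the field   */
        return 0;
}

On the hot vmx_vcpu_run() path this turns the common case, CR3 unchanged since the last entry, into a single compare.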
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index 5e044d506b7a..a179254a5122 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -27,7 +27,7 @@ static inline struct desc_struct FPU_get_ldt_descriptor(unsigned seg)
27#ifdef CONFIG_MODIFY_LDT_SYSCALL 27#ifdef CONFIG_MODIFY_LDT_SYSCALL
28 seg >>= 3; 28 seg >>= 3;
29 mutex_lock(&current->mm->context.lock); 29 mutex_lock(&current->mm->context.lock);
30 if (current->mm->context.ldt && seg < current->mm->context.ldt->size) 30 if (current->mm->context.ldt && seg < current->mm->context.ldt->nr_entries)
31 ret = current->mm->context.ldt->entries[seg]; 31 ret = current->mm->context.ldt->entries[seg];
32 mutex_unlock(&current->mm->context.lock); 32 mutex_unlock(&current->mm->context.lock);
33#endif 33#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..0fbdcb64f9f8 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
2KCOV_INSTRUMENT_tlb.o := n 2KCOV_INSTRUMENT_tlb.o := n
3 3
4obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 4obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
5 pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o 5 pat.o pgtable.o physaddr.o setup_nx.o tlb.o
6 6
7# Make sure __phys_addr has no stackprotector 7# Make sure __phys_addr has no stackprotector
8nostackp := $(call cc-option, -fno-stack-protector) 8nostackp := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index bce6990b1d81..0470826d2bdc 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -431,7 +431,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
431 bool checkwx) 431 bool checkwx)
432{ 432{
433#ifdef CONFIG_X86_64 433#ifdef CONFIG_X86_64
434 pgd_t *start = (pgd_t *) &init_level4_pgt; 434 pgd_t *start = (pgd_t *) &init_top_pgt;
435#else 435#else
436 pgd_t *start = swapper_pg_dir; 436 pgd_t *start = swapper_pg_dir;
437#endif 437#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 8ad91a01cbc8..2a1fa10c6a98 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -346,7 +346,7 @@ static noinline int vmalloc_fault(unsigned long address)
346 * Do _not_ use "current" here. We might be inside 346 * Do _not_ use "current" here. We might be inside
347 * an interrupt in the middle of a task switch.. 347 * an interrupt in the middle of a task switch..
348 */ 348 */
349 pgd_paddr = read_cr3(); 349 pgd_paddr = read_cr3_pa();
350 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); 350 pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
351 if (!pmd_k) 351 if (!pmd_k)
352 return -1; 352 return -1;
@@ -388,7 +388,7 @@ static bool low_pfn(unsigned long pfn)
388 388
389static void dump_pagetable(unsigned long address) 389static void dump_pagetable(unsigned long address)
390{ 390{
391 pgd_t *base = __va(read_cr3()); 391 pgd_t *base = __va(read_cr3_pa());
392 pgd_t *pgd = &base[pgd_index(address)]; 392 pgd_t *pgd = &base[pgd_index(address)];
393 p4d_t *p4d; 393 p4d_t *p4d;
394 pud_t *pud; 394 pud_t *pud;
@@ -451,7 +451,7 @@ static noinline int vmalloc_fault(unsigned long address)
451 * happen within a race in page table update. In the later 451 * happen within a race in page table update. In the later
452 * case just flush: 452 * case just flush:
453 */ 453 */
454 pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); 454 pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
455 pgd_ref = pgd_offset_k(address); 455 pgd_ref = pgd_offset_k(address);
456 if (pgd_none(*pgd_ref)) 456 if (pgd_none(*pgd_ref))
457 return -1; 457 return -1;
@@ -555,7 +555,7 @@ static int bad_address(void *p)
555 555
556static void dump_pagetable(unsigned long address) 556static void dump_pagetable(unsigned long address)
557{ 557{
558 pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); 558 pgd_t *base = __va(read_cr3_pa());
559 pgd_t *pgd = base + pgd_index(address); 559 pgd_t *pgd = base + pgd_index(address);
560 p4d_t *p4d; 560 p4d_t *p4d;
561 pud_t *pud; 561 pud_t *pud;
@@ -700,7 +700,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
700 pgd_t *pgd; 700 pgd_t *pgd;
701 pte_t *pte; 701 pte_t *pte;
702 702
703 pgd = __va(read_cr3() & PHYSICAL_PAGE_MASK); 703 pgd = __va(read_cr3_pa());
704 pgd += pgd_index(address); 704 pgd += pgd_index(address);
705 705
706 pte = lookup_address_in_pgd(pgd, address, &level); 706 pte = lookup_address_in_pgd(pgd, address, &level);
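Every read_cr3() in fault.c that really wanted a page-table pointer now goes through read_cr3_pa(), which also replaces the explicit "& PHYSICAL_PAGE_MASK" seen in the old dump_pagetable() and show_fault_oops(). The helper itself is introduced elsewhere in this series; the sketch below is only an assumption about what it amounts to (the mask value and names are illustrative): strip the low PCID/flag bits and the bit-63 no-flush hint so that only the physical address of the top-level table remains.

#include <stdio.h>
#include <stdint.h>

/* Assumed layout: bits 0-11 carry PCID/flags, bit 63 is the no-flush hint. */
#define CR3_ADDR_MASK 0x7ffffffffffff000ULL

/* Extract the physical address of the top-level page table from a CR3 value. */
static uint64_t cr3_pa(uint64_t cr3)
{
        return cr3 & CR3_ADDR_MASK;
}

int main(void)
{
        uint64_t cr3 = 0x000000012345a001ULL;   /* example value, PCID 1 in the low bits */

        printf("page-table base = %#llx\n", (unsigned long long)cr3_pa(cr3));
        return 0;
}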
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
deleted file mode 100644
index 456dfdfd2249..000000000000
--- a/arch/x86/mm/gup.c
+++ /dev/null
@@ -1,496 +0,0 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11#include <linux/swap.h>
12#include <linux/memremap.h>
13
14#include <asm/mmu_context.h>
15#include <asm/pgtable.h>
16
17static inline pte_t gup_get_pte(pte_t *ptep)
18{
19#ifndef CONFIG_X86_PAE
20 return READ_ONCE(*ptep);
21#else
22 /*
23 * With get_user_pages_fast, we walk down the pagetables without taking
24 * any locks. For this we would like to load the pointers atomically,
25 * but that is not possible (without expensive cmpxchg8b) on PAE. What
26 * we do have is the guarantee that a pte will only either go from not
27 * present to present, or present to not present or both -- it will not
28 * switch to a completely different present page without a TLB flush in
29 * between; something that we are blocking by holding interrupts off.
30 *
31 * Setting ptes from not present to present goes:
32 * ptep->pte_high = h;
33 * smp_wmb();
34 * ptep->pte_low = l;
35 *
36 * And present to not present goes:
37 * ptep->pte_low = 0;
38 * smp_wmb();
39 * ptep->pte_high = 0;
40 *
41 * We must ensure here that the load of pte_low sees l iff pte_high
42 * sees h. We load pte_high *after* loading pte_low, which ensures we
43 * don't see an older value of pte_high. *Then* we recheck pte_low,
44 * which ensures that we haven't picked up a changed pte high. We might
45 * have got rubbish values from pte_low and pte_high, but we are
46 * guaranteed that pte_low will not have the present bit set *unless*
47 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
48 * we're safe.
49 *
50 * gup_get_pte should not be used or copied outside gup.c without being
51 * very careful -- it does not atomically load the pte or anything that
52 * is likely to be useful for you.
53 */
54 pte_t pte;
55
56retry:
57 pte.pte_low = ptep->pte_low;
58 smp_rmb();
59 pte.pte_high = ptep->pte_high;
60 smp_rmb();
61 if (unlikely(pte.pte_low != ptep->pte_low))
62 goto retry;
63
64 return pte;
65#endif
66}
67
68static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
69{
70 while ((*nr) - nr_start) {
71 struct page *page = pages[--(*nr)];
72
73 ClearPageReferenced(page);
74 put_page(page);
75 }
76}
77
78/*
79 * 'pteval' can come from a pte, pmd, pud or p4d. We only check
80 * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
81 * same value on all 4 types.
82 */
83static inline int pte_allows_gup(unsigned long pteval, int write)
84{
85 unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
86
87 if (write)
88 need_pte_bits |= _PAGE_RW;
89
90 if ((pteval & need_pte_bits) != need_pte_bits)
91 return 0;
92
93 /* Check memory protection keys permissions. */
94 if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
95 return 0;
96
97 return 1;
98}
99
100/*
101 * The performance critical leaf functions are made noinline otherwise gcc
102 * inlines everything into a single function which results in too much
103 * register pressure.
104 */
105static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
106 unsigned long end, int write, struct page **pages, int *nr)
107{
108 struct dev_pagemap *pgmap = NULL;
109 int nr_start = *nr, ret = 0;
110 pte_t *ptep, *ptem;
111
112 /*
113 * Keep the original mapped PTE value (ptem) around since we
114 * might increment ptep off the end of the page when finishing
115 * our loop iteration.
116 */
117 ptem = ptep = pte_offset_map(&pmd, addr);
118 do {
119 pte_t pte = gup_get_pte(ptep);
120 struct page *page;
121
122 /* Similar to the PMD case, NUMA hinting must take slow path */
123 if (pte_protnone(pte))
124 break;
125
126 if (!pte_allows_gup(pte_val(pte), write))
127 break;
128
129 if (pte_devmap(pte)) {
130 pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
131 if (unlikely(!pgmap)) {
132 undo_dev_pagemap(nr, nr_start, pages);
133 break;
134 }
135 } else if (pte_special(pte))
136 break;
137
138 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
139 page = pte_page(pte);
140 get_page(page);
141 put_dev_pagemap(pgmap);
142 SetPageReferenced(page);
143 pages[*nr] = page;
144 (*nr)++;
145
146 } while (ptep++, addr += PAGE_SIZE, addr != end);
147 if (addr == end)
148 ret = 1;
149 pte_unmap(ptem);
150
151 return ret;
152}
153
154static inline void get_head_page_multiple(struct page *page, int nr)
155{
156 VM_BUG_ON_PAGE(page != compound_head(page), page);
157 VM_BUG_ON_PAGE(page_count(page) == 0, page);
158 page_ref_add(page, nr);
159 SetPageReferenced(page);
160}
161
162static int __gup_device_huge(unsigned long pfn, unsigned long addr,
163 unsigned long end, struct page **pages, int *nr)
164{
165 int nr_start = *nr;
166 struct dev_pagemap *pgmap = NULL;
167
168 do {
169 struct page *page = pfn_to_page(pfn);
170
171 pgmap = get_dev_pagemap(pfn, pgmap);
172 if (unlikely(!pgmap)) {
173 undo_dev_pagemap(nr, nr_start, pages);
174 return 0;
175 }
176 SetPageReferenced(page);
177 pages[*nr] = page;
178 get_page(page);
179 put_dev_pagemap(pgmap);
180 (*nr)++;
181 pfn++;
182 } while (addr += PAGE_SIZE, addr != end);
183 return 1;
184}
185
186static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
187 unsigned long end, struct page **pages, int *nr)
188{
189 unsigned long fault_pfn;
190
191 fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
192 return __gup_device_huge(fault_pfn, addr, end, pages, nr);
193}
194
195static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
196 unsigned long end, struct page **pages, int *nr)
197{
198 unsigned long fault_pfn;
199
200 fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
201 return __gup_device_huge(fault_pfn, addr, end, pages, nr);
202}
203
204static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
205 unsigned long end, int write, struct page **pages, int *nr)
206{
207 struct page *head, *page;
208 int refs;
209
210 if (!pte_allows_gup(pmd_val(pmd), write))
211 return 0;
212
213 VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
214 if (pmd_devmap(pmd))
215 return __gup_device_huge_pmd(pmd, addr, end, pages, nr);
216
217 /* hugepages are never "special" */
218 VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL);
219
220 refs = 0;
221 head = pmd_page(pmd);
222 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
223 do {
224 VM_BUG_ON_PAGE(compound_head(page) != head, page);
225 pages[*nr] = page;
226 (*nr)++;
227 page++;
228 refs++;
229 } while (addr += PAGE_SIZE, addr != end);
230 get_head_page_multiple(head, refs);
231
232 return 1;
233}
234
235static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
236 int write, struct page **pages, int *nr)
237{
238 unsigned long next;
239 pmd_t *pmdp;
240
241 pmdp = pmd_offset(&pud, addr);
242 do {
243 pmd_t pmd = *pmdp;
244
245 next = pmd_addr_end(addr, end);
246 if (pmd_none(pmd))
247 return 0;
248 if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) {
249 /*
250 * NUMA hinting faults need to be handled in the GUP
251 * slowpath for accounting purposes and so that they
252 * can be serialised against THP migration.
253 */
254 if (pmd_protnone(pmd))
255 return 0;
256 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
257 return 0;
258 } else {
259 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
260 return 0;
261 }
262 } while (pmdp++, addr = next, addr != end);
263
264 return 1;
265}
266
267static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
268 unsigned long end, int write, struct page **pages, int *nr)
269{
270 struct page *head, *page;
271 int refs;
272
273 if (!pte_allows_gup(pud_val(pud), write))
274 return 0;
275
276 VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
277 if (pud_devmap(pud))
278 return __gup_device_huge_pud(pud, addr, end, pages, nr);
279
280 /* hugepages are never "special" */
281 VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
282
283 refs = 0;
284 head = pud_page(pud);
285 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
286 do {
287 VM_BUG_ON_PAGE(compound_head(page) != head, page);
288 pages[*nr] = page;
289 (*nr)++;
290 page++;
291 refs++;
292 } while (addr += PAGE_SIZE, addr != end);
293 get_head_page_multiple(head, refs);
294
295 return 1;
296}
297
298static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
299 int write, struct page **pages, int *nr)
300{
301 unsigned long next;
302 pud_t *pudp;
303
304 pudp = pud_offset(&p4d, addr);
305 do {
306 pud_t pud = *pudp;
307
308 next = pud_addr_end(addr, end);
309 if (pud_none(pud))
310 return 0;
311 if (unlikely(pud_large(pud))) {
312 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
313 return 0;
314 } else {
315 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
316 return 0;
317 }
318 } while (pudp++, addr = next, addr != end);
319
320 return 1;
321}
322
323static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
324 int write, struct page **pages, int *nr)
325{
326 unsigned long next;
327 p4d_t *p4dp;
328
329 p4dp = p4d_offset(&pgd, addr);
330 do {
331 p4d_t p4d = *p4dp;
332
333 next = p4d_addr_end(addr, end);
334 if (p4d_none(p4d))
335 return 0;
336 BUILD_BUG_ON(p4d_large(p4d));
337 if (!gup_pud_range(p4d, addr, next, write, pages, nr))
338 return 0;
339 } while (p4dp++, addr = next, addr != end);
340
341 return 1;
342}
343
344/*
345 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
346 * back to the regular GUP.
347 */
348int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
349 struct page **pages)
350{
351 struct mm_struct *mm = current->mm;
352 unsigned long addr, len, end;
353 unsigned long next;
354 unsigned long flags;
355 pgd_t *pgdp;
356 int nr = 0;
357
358 start &= PAGE_MASK;
359 addr = start;
360 len = (unsigned long) nr_pages << PAGE_SHIFT;
361 end = start + len;
362 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
363 (void __user *)start, len)))
364 return 0;
365
366 /*
367 * XXX: batch / limit 'nr', to avoid large irq off latency
368 * needs some instrumenting to determine the common sizes used by
369 * important workloads (eg. DB2), and whether limiting the batch size
370 * will decrease performance.
371 *
372 * It seems like we're in the clear for the moment. Direct-IO is
373 * the main guy that batches up lots of get_user_pages, and even
374 * they are limited to 64-at-a-time which is not so many.
375 */
376 /*
377 * This doesn't prevent pagetable teardown, but does prevent
378 * the pagetables and pages from being freed on x86.
379 *
380 * So long as we atomically load page table pointers versus teardown
381 * (which we do on x86, with the above PAE exception), we can follow the
382 * address down to the the page and take a ref on it.
383 */
384 local_irq_save(flags);
385 pgdp = pgd_offset(mm, addr);
386 do {
387 pgd_t pgd = *pgdp;
388
389 next = pgd_addr_end(addr, end);
390 if (pgd_none(pgd))
391 break;
392 if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
393 break;
394 } while (pgdp++, addr = next, addr != end);
395 local_irq_restore(flags);
396
397 return nr;
398}
399
400/**
401 * get_user_pages_fast() - pin user pages in memory
402 * @start: starting user address
403 * @nr_pages: number of pages from start to pin
404 * @write: whether pages will be written to
405 * @pages: array that receives pointers to the pages pinned.
406 * Should be at least nr_pages long.
407 *
408 * Attempt to pin user pages in memory without taking mm->mmap_sem.
409 * If not successful, it will fall back to taking the lock and
410 * calling get_user_pages().
411 *
412 * Returns number of pages pinned. This may be fewer than the number
413 * requested. If nr_pages is 0 or negative, returns 0. If no pages
414 * were pinned, returns -errno.
415 */
416int get_user_pages_fast(unsigned long start, int nr_pages, int write,
417 struct page **pages)
418{
419 struct mm_struct *mm = current->mm;
420 unsigned long addr, len, end;
421 unsigned long next;
422 pgd_t *pgdp;
423 int nr = 0;
424
425 start &= PAGE_MASK;
426 addr = start;
427 len = (unsigned long) nr_pages << PAGE_SHIFT;
428
429 end = start + len;
430 if (end < start)
431 goto slow_irqon;
432
433#ifdef CONFIG_X86_64
434 if (end >> __VIRTUAL_MASK_SHIFT)
435 goto slow_irqon;
436#endif
437
438 /*
439 * XXX: batch / limit 'nr', to avoid large irq off latency
440 * needs some instrumenting to determine the common sizes used by
441 * important workloads (eg. DB2), and whether limiting the batch size
442 * will decrease performance.
443 *
444 * It seems like we're in the clear for the moment. Direct-IO is
445 * the main guy that batches up lots of get_user_pages, and even
446 * they are limited to 64-at-a-time which is not so many.
447 */
448 /*
449 * This doesn't prevent pagetable teardown, but does prevent
450 * the pagetables and pages from being freed on x86.
451 *
452 * So long as we atomically load page table pointers versus teardown
453 * (which we do on x86, with the above PAE exception), we can follow the
454 * address down to the the page and take a ref on it.
455 */
456 local_irq_disable();
457 pgdp = pgd_offset(mm, addr);
458 do {
459 pgd_t pgd = *pgdp;
460
461 next = pgd_addr_end(addr, end);
462 if (pgd_none(pgd))
463 goto slow;
464 if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
465 goto slow;
466 } while (pgdp++, addr = next, addr != end);
467 local_irq_enable();
468
469 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
470 return nr;
471
472 {
473 int ret;
474
475slow:
476 local_irq_enable();
477slow_irqon:
478 /* Try to get the remaining pages with get_user_pages */
479 start += nr << PAGE_SHIFT;
480 pages += nr;
481
482 ret = get_user_pages_unlocked(start,
483 (end - start) >> PAGE_SHIFT,
484 pages, write ? FOLL_WRITE : 0);
485
486 /* Have to be a bit careful with return values */
487 if (nr > 0) {
488 if (ret < 0)
489 ret = nr;
490 else
491 ret += nr;
492 }
493
494 return ret;
495 }
496}
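The deleted fallback tail ("Have to be a bit careful with return values") merges the count pinned on the lockless fast path with whatever the regular get_user_pages_unlocked() call returns, which may be a negative errno. A small worked version of that merge, under a made-up function name:

#include <stdio.h>

/*
 * Combine the count pinned on the lockless fast path (nr) with the result
 * of the regular get_user_pages() fallback (ret, possibly a negative errno).
 */
static int merge_gup_results(int nr, int ret)
{
        if (nr > 0) {
                if (ret < 0)
                        ret = nr;       /* report the pages we did pin */
                else
                        ret += nr;      /* fast-path pages + slow-path pages */
        }
        return ret;
}

int main(void)
{
        printf("%d\n", merge_gup_results(3, -14));      /* 3: error after a partial pin */
        printf("%d\n", merge_gup_results(3, 5));        /* 8: both paths pinned pages   */
        printf("%d\n", merge_gup_results(0, -14));      /* -14: nothing pinned at all   */
        return 0;
}

A partial fast-path success is never turned into an error; the errno only escapes when no pages were pinned at all.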
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 9b3f9fa5b283..673541eb3b3f 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -811,10 +811,8 @@ void __init zone_sizes_init(void)
811} 811}
812 812
813DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 813DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
814#ifdef CONFIG_SMP 814 .loaded_mm = &init_mm,
815 .active_mm = &init_mm,
816 .state = 0, 815 .state = 0,
817#endif
818 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 816 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
819}; 817};
820EXPORT_SYMBOL_GPL(cpu_tlbstate); 818EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 0a59daf799f8..dae6a5e5ad4a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -92,6 +92,44 @@ __setup("noexec32=", nonx32_setup);
92 * When memory was added make sure all the processes MM have 92 * When memory was added make sure all the processes MM have
93 * suitable PGD entries in the local PGD level page. 93 * suitable PGD entries in the local PGD level page.
94 */ 94 */
95#ifdef CONFIG_X86_5LEVEL
96void sync_global_pgds(unsigned long start, unsigned long end)
97{
98 unsigned long addr;
99
100 for (addr = start; addr <= end; addr = ALIGN(addr + 1, PGDIR_SIZE)) {
101 const pgd_t *pgd_ref = pgd_offset_k(addr);
102 struct page *page;
103
104 /* Check for overflow */
105 if (addr < start)
106 break;
107
108 if (pgd_none(*pgd_ref))
109 continue;
110
111 spin_lock(&pgd_lock);
112 list_for_each_entry(page, &pgd_list, lru) {
113 pgd_t *pgd;
114 spinlock_t *pgt_lock;
115
116 pgd = (pgd_t *)page_address(page) + pgd_index(addr);
117 /* the pgt_lock only for Xen */
118 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
119 spin_lock(pgt_lock);
120
121 if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
122 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
123
124 if (pgd_none(*pgd))
125 set_pgd(pgd, *pgd_ref);
126
127 spin_unlock(pgt_lock);
128 }
129 spin_unlock(&pgd_lock);
130 }
131}
132#else
95void sync_global_pgds(unsigned long start, unsigned long end) 133void sync_global_pgds(unsigned long start, unsigned long end)
96{ 134{
97 unsigned long addr; 135 unsigned long addr;
@@ -135,6 +173,7 @@ void sync_global_pgds(unsigned long start, unsigned long end)
135 spin_unlock(&pgd_lock); 173 spin_unlock(&pgd_lock);
136 } 174 }
137} 175}
176#endif
138 177
139/* 178/*
140 * NOTE: This function is marked __ref because it calls __init function 179 * NOTE: This function is marked __ref because it calls __init function
@@ -585,6 +624,57 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end,
585 return paddr_last; 624 return paddr_last;
586} 625}
587 626
627static unsigned long __meminit
628phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
629 unsigned long page_size_mask)
630{
631 unsigned long paddr_next, paddr_last = paddr_end;
632 unsigned long vaddr = (unsigned long)__va(paddr);
633 int i = p4d_index(vaddr);
634
635 if (!IS_ENABLED(CONFIG_X86_5LEVEL))
636 return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, page_size_mask);
637
638 for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
639 p4d_t *p4d;
640 pud_t *pud;
641
642 vaddr = (unsigned long)__va(paddr);
643 p4d = p4d_page + p4d_index(vaddr);
644 paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
645
646 if (paddr >= paddr_end) {
647 if (!after_bootmem &&
648 !e820__mapped_any(paddr & P4D_MASK, paddr_next,
649 E820_TYPE_RAM) &&
650 !e820__mapped_any(paddr & P4D_MASK, paddr_next,
651 E820_TYPE_RESERVED_KERN))
652 set_p4d(p4d, __p4d(0));
653 continue;
654 }
655
656 if (!p4d_none(*p4d)) {
657 pud = pud_offset(p4d, 0);
658 paddr_last = phys_pud_init(pud, paddr,
659 paddr_end,
660 page_size_mask);
661 __flush_tlb_all();
662 continue;
663 }
664
665 pud = alloc_low_page();
666 paddr_last = phys_pud_init(pud, paddr, paddr_end,
667 page_size_mask);
668
669 spin_lock(&init_mm.page_table_lock);
670 p4d_populate(&init_mm, p4d, pud);
671 spin_unlock(&init_mm.page_table_lock);
672 }
673 __flush_tlb_all();
674
675 return paddr_last;
676}
677
588/* 678/*
589 * Create page table mapping for the physical memory for specific physical 679 * Create page table mapping for the physical memory for specific physical
590 * addresses. The virtual and physical addresses have to be aligned on PMD level 680 * addresses. The virtual and physical addresses have to be aligned on PMD level
@@ -606,26 +696,26 @@ kernel_physical_mapping_init(unsigned long paddr_start,
606 for (; vaddr < vaddr_end; vaddr = vaddr_next) { 696 for (; vaddr < vaddr_end; vaddr = vaddr_next) {
607 pgd_t *pgd = pgd_offset_k(vaddr); 697 pgd_t *pgd = pgd_offset_k(vaddr);
608 p4d_t *p4d; 698 p4d_t *p4d;
609 pud_t *pud;
610 699
611 vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; 700 vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
612 701
613 BUILD_BUG_ON(pgd_none(*pgd)); 702 if (pgd_val(*pgd)) {
614 p4d = p4d_offset(pgd, vaddr); 703 p4d = (p4d_t *)pgd_page_vaddr(*pgd);
615 if (p4d_val(*p4d)) { 704 paddr_last = phys_p4d_init(p4d, __pa(vaddr),
616 pud = (pud_t *)p4d_page_vaddr(*p4d);
617 paddr_last = phys_pud_init(pud, __pa(vaddr),
618 __pa(vaddr_end), 705 __pa(vaddr_end),
619 page_size_mask); 706 page_size_mask);
620 continue; 707 continue;
621 } 708 }
622 709
623 pud = alloc_low_page(); 710 p4d = alloc_low_page();
624 paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), 711 paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
625 page_size_mask); 712 page_size_mask);
626 713
627 spin_lock(&init_mm.page_table_lock); 714 spin_lock(&init_mm.page_table_lock);
628 p4d_populate(&init_mm, p4d, pud); 715 if (IS_ENABLED(CONFIG_X86_5LEVEL))
716 pgd_populate(&init_mm, pgd, p4d);
717 else
718 p4d_populate(&init_mm, p4d_offset(pgd, vaddr), (pud_t *) p4d);
629 spin_unlock(&init_mm.page_table_lock); 719 spin_unlock(&init_mm.page_table_lock);
630 pgd_changed = true; 720 pgd_changed = true;
631 } 721 }
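phys_p4d_init() above only has real work to do when CONFIG_X86_5LEVEL is set; otherwise the IS_ENABLED() check at its top hands the page straight to phys_pud_init(), because with 4-level paging the p4d level is folded into the pgd. A toy illustration of that folding, using made-up macros rather than the kernel's pgtable-nop4d definitions:

#include <stdio.h>

/* Toggle to compare 5-level (a real p4d table) with 4-level (p4d folded). */
#define FIVE_LEVEL 1

#if FIVE_LEVEL
#define PTRS_PER_P4D 512        /* a real table level: 9 bits of index at bit 39 */
#else
#define PTRS_PER_P4D 1          /* folded: the "p4d" is just the pgd entry       */
#endif
#define P4D_SHIFT 39

/*
 * With PTRS_PER_P4D == 1 this always returns 0, so the single p4d slot is the
 * pgd entry itself; that is why the non-5-level path above can cast the page
 * to pud_t * and pass it straight to phys_pud_init().
 */
static unsigned int p4d_index(unsigned long long addr)
{
        return (unsigned int)((addr >> P4D_SHIFT) & (PTRS_PER_P4D - 1));
}

int main(void)
{
        printf("p4d_index(0xffffffff80000000) = %u\n",
               p4d_index(0xffffffff80000000ULL));
        return 0;
}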
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index bbc558b88a88..4c1b5fd0c7ad 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -424,7 +424,7 @@ static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss;
424static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) 424static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
425{ 425{
426 /* Don't assume we're using swapper_pg_dir at this point */ 426 /* Don't assume we're using swapper_pg_dir at this point */
427 pgd_t *base = __va(read_cr3()); 427 pgd_t *base = __va(read_cr3_pa());
428 pgd_t *pgd = &base[pgd_index(addr)]; 428 pgd_t *pgd = &base[pgd_index(addr)];
429 p4d_t *p4d = p4d_offset(pgd, addr); 429 p4d_t *p4d = p4d_offset(pgd, addr);
430 pud_t *pud = pud_offset(p4d, addr); 430 pud_t *pud = pud_offset(p4d, addr);
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 0c7d8129bed6..88215ac16b24 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -12,7 +12,7 @@
12#include <asm/tlbflush.h> 12#include <asm/tlbflush.h>
13#include <asm/sections.h> 13#include <asm/sections.h>
14 14
15extern pgd_t early_level4_pgt[PTRS_PER_PGD]; 15extern pgd_t early_top_pgt[PTRS_PER_PGD];
16extern struct range pfn_mapped[E820_MAX_ENTRIES]; 16extern struct range pfn_mapped[E820_MAX_ENTRIES];
17 17
18static int __init map_range(struct range *range) 18static int __init map_range(struct range *range)
@@ -109,8 +109,8 @@ void __init kasan_early_init(void)
109 for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) 109 for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
110 kasan_zero_p4d[i] = __p4d(p4d_val); 110 kasan_zero_p4d[i] = __p4d(p4d_val);
111 111
112 kasan_map_early_shadow(early_level4_pgt); 112 kasan_map_early_shadow(early_top_pgt);
113 kasan_map_early_shadow(init_level4_pgt); 113 kasan_map_early_shadow(init_top_pgt);
114} 114}
115 115
116void __init kasan_init(void) 116void __init kasan_init(void)
@@ -121,8 +121,8 @@ void __init kasan_init(void)
121 register_die_notifier(&kasan_die_notifier); 121 register_die_notifier(&kasan_die_notifier);
122#endif 122#endif
123 123
124 memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); 124 memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
125 load_cr3(early_level4_pgt); 125 load_cr3(early_top_pgt);
126 __flush_tlb_all(); 126 __flush_tlb_all();
127 127
128 clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); 128 clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
@@ -148,7 +148,7 @@ void __init kasan_init(void)
148 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), 148 kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
149 (void *)KASAN_SHADOW_END); 149 (void *)KASAN_SHADOW_END);
150 150
151 load_cr3(init_level4_pgt); 151 load_cr3(init_top_pgt);
152 __flush_tlb_all(); 152 __flush_tlb_all();
153 153
154 /* 154 /*
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..af599167fe3c 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@
6 * 6 *
7 * Entropy is generated using the KASLR early boot functions now shared in 7 * Entropy is generated using the KASLR early boot functions now shared in
8 * the lib directory (originally written by Kees Cook). Randomization is 8 * the lib directory (originally written by Kees Cook). Randomization is
9 * done on PGD & PUD page table levels to increase possible addresses. The 9 * done on PGD & P4D/PUD page table levels to increase possible addresses.
10 * physical memory mapping code was adapted to support PUD level virtual 10 * The physical memory mapping code was adapted to support P4D/PUD level
11 * addresses. This implementation on the best configuration provides 30,000 11 * virtual addresses. This implementation on the best configuration provides
12 * possible virtual addresses in average for each memory region. An additional 12 * 30,000 possible virtual addresses in average for each memory region.
13 * low memory page is used to ensure each CPU can start with a PGD aligned 13 * An additional low memory page is used to ensure each CPU can start with
14 * virtual address (for realmode). 14 * a PGD aligned virtual address (for realmode).
15 * 15 *
16 * The order of each memory region is not changed. The feature looks at 16 * The order of each memory region is not changed. The feature looks at
17 * the available space for the regions based on different configuration 17 * the available space for the regions based on different configuration
@@ -70,7 +70,7 @@ static __initdata struct kaslr_memory_region {
70 unsigned long *base; 70 unsigned long *base;
71 unsigned long size_tb; 71 unsigned long size_tb;
72} kaslr_regions[] = { 72} kaslr_regions[] = {
73 { &page_offset_base, 64/* Maximum */ }, 73 { &page_offset_base, 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
74 { &vmalloc_base, VMALLOC_SIZE_TB }, 74 { &vmalloc_base, VMALLOC_SIZE_TB },
75 { &vmemmap_base, 1 }, 75 { &vmemmap_base, 1 },
76}; 76};
@@ -142,7 +142,10 @@ void __init kernel_randomize_memory(void)
142 */ 142 */
143 entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); 143 entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
144 prandom_bytes_state(&rand_state, &rand, sizeof(rand)); 144 prandom_bytes_state(&rand_state, &rand, sizeof(rand));
145 entropy = (rand % (entropy + 1)) & PUD_MASK; 145 if (IS_ENABLED(CONFIG_X86_5LEVEL))
146 entropy = (rand % (entropy + 1)) & P4D_MASK;
147 else
148 entropy = (rand % (entropy + 1)) & PUD_MASK;
146 vaddr += entropy; 149 vaddr += entropy;
147 *kaslr_regions[i].base = vaddr; 150 *kaslr_regions[i].base = vaddr;
148 151
@@ -151,27 +154,21 @@ void __init kernel_randomize_memory(void)
151 * randomization alignment. 154 * randomization alignment.
152 */ 155 */
153 vaddr += get_padding(&kaslr_regions[i]); 156 vaddr += get_padding(&kaslr_regions[i]);
154 vaddr = round_up(vaddr + 1, PUD_SIZE); 157 if (IS_ENABLED(CONFIG_X86_5LEVEL))
158 vaddr = round_up(vaddr + 1, P4D_SIZE);
159 else
160 vaddr = round_up(vaddr + 1, PUD_SIZE);
155 remain_entropy -= entropy; 161 remain_entropy -= entropy;
156 } 162 }
157} 163}
158 164
159/* 165static void __meminit init_trampoline_pud(void)
160 * Create PGD aligned trampoline table to allow real mode initialization
161 * of additional CPUs. Consume only 1 low memory page.
162 */
163void __meminit init_trampoline(void)
164{ 166{
165 unsigned long paddr, paddr_next; 167 unsigned long paddr, paddr_next;
166 pgd_t *pgd; 168 pgd_t *pgd;
167 pud_t *pud_page, *pud_page_tramp; 169 pud_t *pud_page, *pud_page_tramp;
168 int i; 170 int i;
169 171
170 if (!kaslr_memory_enabled()) {
171 init_trampoline_default();
172 return;
173 }
174
175 pud_page_tramp = alloc_low_page(); 172 pud_page_tramp = alloc_low_page();
176 173
177 paddr = 0; 174 paddr = 0;
@@ -192,3 +189,49 @@ void __meminit init_trampoline(void)
192 set_pgd(&trampoline_pgd_entry, 189 set_pgd(&trampoline_pgd_entry,
193 __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); 190 __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
194} 191}
192
193static void __meminit init_trampoline_p4d(void)
194{
195 unsigned long paddr, paddr_next;
196 pgd_t *pgd;
197 p4d_t *p4d_page, *p4d_page_tramp;
198 int i;
199
200 p4d_page_tramp = alloc_low_page();
201
202 paddr = 0;
203 pgd = pgd_offset_k((unsigned long)__va(paddr));
204 p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
205
206 for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
207 p4d_t *p4d, *p4d_tramp;
208 unsigned long vaddr = (unsigned long)__va(paddr);
209
210 p4d_tramp = p4d_page_tramp + p4d_index(paddr);
211 p4d = p4d_page + p4d_index(vaddr);
212 paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
213
214 *p4d_tramp = *p4d;
215 }
216
217 set_pgd(&trampoline_pgd_entry,
218 __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
219}
220
221/*
222 * Create PGD aligned trampoline table to allow real mode initialization
223 * of additional CPUs. Consume only 1 low memory page.
224 */
225void __meminit init_trampoline(void)
226{
227
228 if (!kaslr_memory_enabled()) {
229 init_trampoline_default();
230 return;
231 }
232
233 if (IS_ENABLED(CONFIG_X86_5LEVEL))
234 init_trampoline_p4d();
235 else
236 init_trampoline_pud();
237}
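The first kaslr_regions[] entry is now sized from the physical address width rather than the old hard-coded 64 TB. With TB_SHIFT = 40, the expression 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) works out as below; the 46- and 52-bit widths are the usual 4-level and 5-level limits and are assumptions here, not taken from this hunk:

#include <stdio.h>

#define TB_SHIFT 40

/* Maximum direct-mapping size, in TB, for a given physical address width. */
static unsigned long max_direct_map_tb(unsigned int physical_mask_shift)
{
        return 1UL << (physical_mask_shift - TB_SHIFT);
}

int main(void)
{
        printf("46-bit physical: %4lu TB\n", max_direct_map_tb(46));    /* 64 TB, the old constant */
        printf("52-bit physical: %4lu TB\n", max_direct_map_tb(52));    /* 4096 TB with 5-level    */
        return 0;
}

So existing 4-level configurations keep the old 64 TB sizing, and the region grows automatically when 5-level paging raises the physical address width.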
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 19ad095b41df..797295e792b2 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -74,9 +74,6 @@ static int mmap_is_legacy(void)
74 if (current->personality & ADDR_COMPAT_LAYOUT) 74 if (current->personality & ADDR_COMPAT_LAYOUT)
75 return 1; 75 return 1;
76 76
77 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
78 return 1;
79
80 return sysctl_legacy_va_layout; 77 return sysctl_legacy_va_layout;
81} 78}
82 79
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 6e7bedf69af7..014d07a80053 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,7 +15,7 @@
15#include <linux/debugfs.h> 15#include <linux/debugfs.h>
16 16
17/* 17/*
18 * Smarter SMP flushing macros. 18 * TLB flushing, formerly SMP-only
19 * c/o Linus Torvalds. 19 * c/o Linus Torvalds.
20 * 20 *
21 * These mean you can really definitely utterly forget about 21 * These mean you can really definitely utterly forget about
@@ -28,39 +28,28 @@
28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
29 */ 29 */
30 30
31#ifdef CONFIG_SMP
32
33struct flush_tlb_info {
34 struct mm_struct *flush_mm;
35 unsigned long flush_start;
36 unsigned long flush_end;
37};
38
39/*
40 * We cannot call mmdrop() because we are in interrupt context,
41 * instead update mm->cpu_vm_mask.
42 */
43void leave_mm(int cpu) 31void leave_mm(int cpu)
44{ 32{
45 struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); 33 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
34
35 /*
36 * It's plausible that we're in lazy TLB mode while our mm is init_mm.
37 * If so, our callers still expect us to flush the TLB, but there
38 * aren't any user TLB entries in init_mm to worry about.
39 *
40 * This needs to happen before any other sanity checks due to
41 * intel_idle's shenanigans.
42 */
43 if (loaded_mm == &init_mm)
44 return;
45
46 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 46 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
47 BUG(); 47 BUG();
48 if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { 48
49 cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); 49 switch_mm(NULL, &init_mm, NULL);
50 load_cr3(swapper_pg_dir);
51 /*
52 * This gets called in the idle path where RCU
53 * functions differently. Tracing normally
54 * uses RCU, so we have to call the tracepoint
55 * specially here.
56 */
57 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
58 }
59} 50}
60EXPORT_SYMBOL_GPL(leave_mm); 51EXPORT_SYMBOL_GPL(leave_mm);
61 52
62#endif /* CONFIG_SMP */
63
64void switch_mm(struct mm_struct *prev, struct mm_struct *next, 53void switch_mm(struct mm_struct *prev, struct mm_struct *next,
65 struct task_struct *tsk) 54 struct task_struct *tsk)
66{ 55{
@@ -75,216 +64,167 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
75 struct task_struct *tsk) 64 struct task_struct *tsk)
76{ 65{
77 unsigned cpu = smp_processor_id(); 66 unsigned cpu = smp_processor_id();
67 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
78 68
79 if (likely(prev != next)) { 69 /*
80 if (IS_ENABLED(CONFIG_VMAP_STACK)) { 70 * NB: The scheduler will call us with prev == next when
81 /* 71 * switching from lazy TLB mode to normal mode if active_mm
82 * If our current stack is in vmalloc space and isn't 72 * isn't changing. When this happens, there is no guarantee
83 * mapped in the new pgd, we'll double-fault. Forcibly 73 * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
84 * map it. 74 *
85 */ 75 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
86 unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); 76 */
87
88 pgd_t *pgd = next->pgd + stack_pgd_index;
89
90 if (unlikely(pgd_none(*pgd)))
91 set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
92 }
93 77
94#ifdef CONFIG_SMP 78 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
95 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
96 this_cpu_write(cpu_tlbstate.active_mm, next);
97#endif
98 79
99 cpumask_set_cpu(cpu, mm_cpumask(next)); 80 if (real_prev == next) {
81 /*
82 * There's nothing to do: we always keep the per-mm control
83 * regs in sync with cpu_tlbstate.loaded_mm. Just
84 * sanity-check mm_cpumask.
85 */
86 if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
87 cpumask_set_cpu(cpu, mm_cpumask(next));
88 return;
89 }
100 90
91 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
101 /* 92 /*
102 * Re-load page tables. 93 * If our current stack is in vmalloc space and isn't
103 * 94 * mapped in the new pgd, we'll double-fault. Forcibly
104 * This logic has an ordering constraint: 95 * map it.
105 *
106 * CPU 0: Write to a PTE for 'next'
107 * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
108 * CPU 1: set bit 1 in next's mm_cpumask
109 * CPU 1: load from the PTE that CPU 0 writes (implicit)
110 *
111 * We need to prevent an outcome in which CPU 1 observes
112 * the new PTE value and CPU 0 observes bit 1 clear in
113 * mm_cpumask. (If that occurs, then the IPI will never
114 * be sent, and CPU 0's TLB will contain a stale entry.)
115 *
116 * The bad outcome can occur if either CPU's load is
117 * reordered before that CPU's store, so both CPUs must
118 * execute full barriers to prevent this from happening.
119 *
120 * Thus, switch_mm needs a full barrier between the
121 * store to mm_cpumask and any operation that could load
122 * from next->pgd. TLB fills are special and can happen
123 * due to instruction fetches or for no reason at all,
124 * and neither LOCK nor MFENCE orders them.
125 * Fortunately, load_cr3() is serializing and gives the
126 * ordering guarantee we need.
127 *
128 */ 96 */
129 load_cr3(next->pgd); 97 unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
130 98
131 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 99 pgd_t *pgd = next->pgd + stack_pgd_index;
132 100
133 /* Stop flush ipis for the previous mm */ 101 if (unlikely(pgd_none(*pgd)))
134 cpumask_clear_cpu(cpu, mm_cpumask(prev)); 102 set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
103 }
135 104
136 /* Load per-mm CR4 state */ 105 this_cpu_write(cpu_tlbstate.loaded_mm, next);
137 load_mm_cr4(next);
138 106
139#ifdef CONFIG_MODIFY_LDT_SYSCALL 107 WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
140 /* 108 cpumask_set_cpu(cpu, mm_cpumask(next));
141 * Load the LDT, if the LDT is different. 109
142 * 110 /*
143 * It's possible that prev->context.ldt doesn't match 111 * Re-load page tables.
144 * the LDT register. This can happen if leave_mm(prev) 112 *
145 * was called and then modify_ldt changed 113 * This logic has an ordering constraint:
146 * prev->context.ldt but suppressed an IPI to this CPU. 114 *
147 * In this case, prev->context.ldt != NULL, because we 115 * CPU 0: Write to a PTE for 'next'
148 * never set context.ldt to NULL while the mm still 116 * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
149 * exists. That means that next->context.ldt != 117 * CPU 1: set bit 1 in next's mm_cpumask
150 * prev->context.ldt, because mms never share an LDT. 118 * CPU 1: load from the PTE that CPU 0 writes (implicit)
151 */ 119 *
152 if (unlikely(prev->context.ldt != next->context.ldt)) 120 * We need to prevent an outcome in which CPU 1 observes
153 load_mm_ldt(next); 121 * the new PTE value and CPU 0 observes bit 1 clear in
154#endif 122 * mm_cpumask. (If that occurs, then the IPI will never
123 * be sent, and CPU 0's TLB will contain a stale entry.)
124 *
125 * The bad outcome can occur if either CPU's load is
126 * reordered before that CPU's store, so both CPUs must
127 * execute full barriers to prevent this from happening.
128 *
129 * Thus, switch_mm needs a full barrier between the
130 * store to mm_cpumask and any operation that could load
131 * from next->pgd. TLB fills are special and can happen
132 * due to instruction fetches or for no reason at all,
133 * and neither LOCK nor MFENCE orders them.
134 * Fortunately, load_cr3() is serializing and gives the
135 * ordering guarantee we need.
136 */
137 load_cr3(next->pgd);
138
139 /*
140 * This gets called via leave_mm() in the idle path where RCU
141 * functions differently. Tracing normally uses RCU, so we have to
142 * call the tracepoint specially here.
143 */
144 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
145
146 /* Stop flush ipis for the previous mm */
147 WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
148 real_prev != &init_mm);
149 cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
150
151 /* Load per-mm CR4 and LDTR state */
152 load_mm_cr4(next);
153 switch_ldt(real_prev, next);
154}
155
156static void flush_tlb_func_common(const struct flush_tlb_info *f,
157 bool local, enum tlb_flush_reason reason)
158{
159 /* This code cannot presently handle being reentered. */
160 VM_WARN_ON(!irqs_disabled());
161
162 if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
163 leave_mm(smp_processor_id());
164 return;
155 } 165 }
156#ifdef CONFIG_SMP
157 else {
158 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
159 BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
160
161 if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
162 /*
163 * On established mms, the mm_cpumask is only changed
164 * from irq context, from ptep_clear_flush() while in
165 * lazy tlb mode, and here. Irqs are blocked during
166 * schedule, protecting us from simultaneous changes.
167 */
168 cpumask_set_cpu(cpu, mm_cpumask(next));
169 166
170 /* 167 if (f->end == TLB_FLUSH_ALL) {
171 * We were in lazy tlb mode and leave_mm disabled 168 local_flush_tlb();
172 * tlb flush IPI delivery. We must reload CR3 169 if (local)
173 * to make sure to use no freed page tables. 170 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
174 * 171 trace_tlb_flush(reason, TLB_FLUSH_ALL);
175 * As above, load_cr3() is serializing and orders TLB 172 } else {
176 * fills with respect to the mm_cpumask write. 173 unsigned long addr;
177 */ 174 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
178 load_cr3(next->pgd); 175 addr = f->start;
179 trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 176 while (addr < f->end) {
180 load_mm_cr4(next); 177 __flush_tlb_single(addr);
181 load_mm_ldt(next); 178 addr += PAGE_SIZE;
182 } 179 }
180 if (local)
181 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
182 trace_tlb_flush(reason, nr_pages);
183 } 183 }
184#endif
185} 184}
186 185
187#ifdef CONFIG_SMP 186static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
187{
188 const struct flush_tlb_info *f = info;
188 189
189/* 190 flush_tlb_func_common(f, true, reason);
190 * The flush IPI assumes that a thread switch happens in this order: 191}
191 * [cpu0: the cpu that switches]
192 * 1) switch_mm() either 1a) or 1b)
193 * 1a) thread switch to a different mm
194 * 1a1) set cpu_tlbstate to TLBSTATE_OK
195 * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
196 * if cpu0 was in lazy tlb mode.
197 * 1a2) update cpu active_mm
198 * Now cpu0 accepts tlb flushes for the new mm.
199 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
200 * Now the other cpus will send tlb flush ipis.
201 * 1a4) change cr3.
202 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
203 * Stop ipi delivery for the old mm. This is not synchronized with
204 * the other cpus, but flush_tlb_func ignores flush ipis for the wrong
205 * mm, and in the worst case we perform a superfluous tlb flush.
206 * 1b) thread switch without mm change
207 * cpu active_mm is correct, cpu0 already handles flush ipis.
208 * 1b1) set cpu_tlbstate to TLBSTATE_OK
209 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
210 * Atomically set the bit [other cpus will start sending flush ipis],
211 * and test the bit.
212 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
213 * 2) switch %%esp, ie current
214 *
215 * The interrupt must handle 2 special cases:
216 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
217 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
218 * runs in kernel space, the cpu could load tlb entries for user space
219 * pages.
220 *
221 * The good news is that cpu_tlbstate is local to each cpu, no
222 * write/read ordering problems.
223 */
224 192
225/* 193static void flush_tlb_func_remote(void *info)
226 * TLB flush function:
227 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
228 * 2) Leave the mm if we are in the lazy tlb mode.
229 */
230static void flush_tlb_func(void *info)
231{ 194{
232 struct flush_tlb_info *f = info; 195 const struct flush_tlb_info *f = info;
233 196
234 inc_irq_stat(irq_tlb_count); 197 inc_irq_stat(irq_tlb_count);
235 198
236 if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) 199 if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
237 return; 200 return;
238 201
239 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 202 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
240 if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { 203 flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
241 if (f->flush_end == TLB_FLUSH_ALL) {
242 local_flush_tlb();
243 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
244 } else {
245 unsigned long addr;
246 unsigned long nr_pages =
247 (f->flush_end - f->flush_start) / PAGE_SIZE;
248 addr = f->flush_start;
249 while (addr < f->flush_end) {
250 __flush_tlb_single(addr);
251 addr += PAGE_SIZE;
252 }
253 trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
254 }
255 } else
256 leave_mm(smp_processor_id());
257
258} 204}
259 205
260void native_flush_tlb_others(const struct cpumask *cpumask, 206void native_flush_tlb_others(const struct cpumask *cpumask,
261 struct mm_struct *mm, unsigned long start, 207 const struct flush_tlb_info *info)
262 unsigned long end)
263{ 208{
264 struct flush_tlb_info info;
265
266 info.flush_mm = mm;
267 info.flush_start = start;
268 info.flush_end = end;
269
270 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); 209 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
271 if (end == TLB_FLUSH_ALL) 210 if (info->end == TLB_FLUSH_ALL)
272 trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); 211 trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
273 else 212 else
274 trace_tlb_flush(TLB_REMOTE_SEND_IPI, 213 trace_tlb_flush(TLB_REMOTE_SEND_IPI,
275 (end - start) >> PAGE_SHIFT); 214 (info->end - info->start) >> PAGE_SHIFT);
276 215
277 if (is_uv_system()) { 216 if (is_uv_system()) {
278 unsigned int cpu; 217 unsigned int cpu;
279 218
280 cpu = smp_processor_id(); 219 cpu = smp_processor_id();
281 cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); 220 cpumask = uv_flush_tlb_others(cpumask, info);
282 if (cpumask) 221 if (cpumask)
283 smp_call_function_many(cpumask, flush_tlb_func, 222 smp_call_function_many(cpumask, flush_tlb_func_remote,
284 &info, 1); 223 (void *)info, 1);
285 return; 224 return;
286 } 225 }
287 smp_call_function_many(cpumask, flush_tlb_func, &info, 1); 226 smp_call_function_many(cpumask, flush_tlb_func_remote,
227 (void *)info, 1);
288} 228}
289 229
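Editor's note: the hunk above also splits the old flush_tlb_func() into a shared body, flush_tlb_func_common(), with thin local and remote entry points, so the same range walk runs whether a flush is requested on the current CPU or arrives via IPI. A minimal sketch of that wrapper pattern follows (the sketch_* names are invented for illustration; the real helpers also bump VM event counters, emit tracepoints, and fall back to leave_mm() in lazy TLB mode):

/*
 * Sketch of the common/local/remote split introduced above (illustrative
 * only, not the full kernel implementation).
 */
static void sketch_flush_common(const struct flush_tlb_info *f)
{
	unsigned long addr;

	if (f->end == TLB_FLUSH_ALL) {
		local_flush_tlb();		/* flush the whole TLB */
		return;
	}
	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
		__flush_tlb_single(addr);	/* one INVLPG per page */
}

static void sketch_flush_local(const struct flush_tlb_info *f)
{
	sketch_flush_common(f);			/* caller runs with IRQs disabled */
}

static void sketch_flush_remote(void *info)	/* smp_call_function_many() callback */
{
	sketch_flush_common(info);
}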
290/* 230/*
@@ -302,85 +242,41 @@ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
302void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 242void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
303 unsigned long end, unsigned long vmflag) 243 unsigned long end, unsigned long vmflag)
304{ 244{
305 unsigned long addr; 245 int cpu;
306 /* do a global flush by default */
307 unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
308
309 preempt_disable();
310 246
311 if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) 247 struct flush_tlb_info info = {
312 base_pages_to_flush = (end - start) >> PAGE_SHIFT; 248 .mm = mm,
313 if (base_pages_to_flush > tlb_single_page_flush_ceiling) 249 };
314 base_pages_to_flush = TLB_FLUSH_ALL;
315 250
316 if (current->active_mm != mm) { 251 cpu = get_cpu();
317 /* Synchronize with switch_mm. */
318 smp_mb();
319 252
320 goto out; 253 /* Synchronize with switch_mm. */
321 } 254 smp_mb();
322
323 if (!current->mm) {
324 leave_mm(smp_processor_id());
325 255
326 /* Synchronize with switch_mm. */ 256 /* Should we flush just the requested range? */
327 smp_mb(); 257 if ((end != TLB_FLUSH_ALL) &&
328 258 !(vmflag & VM_HUGETLB) &&
329 goto out; 259 ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
330 } 260 info.start = start;
331 261 info.end = end;
332 /*
333 * Both branches below are implicit full barriers (MOV to CR or
334 * INVLPG) that synchronize with switch_mm.
335 */
336 if (base_pages_to_flush == TLB_FLUSH_ALL) {
337 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
338 local_flush_tlb();
339 } else { 262 } else {
340 /* flush range by one by one 'invlpg' */ 263 info.start = 0UL;
341 for (addr = start; addr < end; addr += PAGE_SIZE) { 264 info.end = TLB_FLUSH_ALL;
342 count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
343 __flush_tlb_single(addr);
344 }
345 }
346 trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
347out:
348 if (base_pages_to_flush == TLB_FLUSH_ALL) {
349 start = 0UL;
350 end = TLB_FLUSH_ALL;
351 } 265 }
352 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
353 flush_tlb_others(mm_cpumask(mm), mm, start, end);
354 preempt_enable();
355}
356 266
357void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) 267 if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
358{ 268 VM_WARN_ON(irqs_disabled());
359 struct mm_struct *mm = vma->vm_mm; 269 local_irq_disable();
360 270 flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
361 preempt_disable(); 271 local_irq_enable();
362
363 if (current->active_mm == mm) {
364 if (current->mm) {
365 /*
366 * Implicit full barrier (INVLPG) that synchronizes
367 * with switch_mm.
368 */
369 __flush_tlb_one(start);
370 } else {
371 leave_mm(smp_processor_id());
372
373 /* Synchronize with switch_mm. */
374 smp_mb();
375 }
376 } 272 }
377 273
378 if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) 274 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
379 flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); 275 flush_tlb_others(mm_cpumask(mm), &info);
380 276 put_cpu();
381 preempt_enable();
382} 277}
383 278
279
384static void do_flush_tlb_all(void *info) 280static void do_flush_tlb_all(void *info)
385{ 281{
386 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 282 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -401,7 +297,7 @@ static void do_kernel_range_flush(void *info)
401 unsigned long addr; 297 unsigned long addr;
402 298
403 /* flush range by one by one 'invlpg' */ 299 /* flush range by one by one 'invlpg' */
404 for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) 300 for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
405 __flush_tlb_single(addr); 301 __flush_tlb_single(addr);
406} 302}
407 303
@@ -410,16 +306,40 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
410 306
411 /* Balance as user space task's flush, a bit conservative */ 307 /* Balance as user space task's flush, a bit conservative */
412 if (end == TLB_FLUSH_ALL || 308 if (end == TLB_FLUSH_ALL ||
413 (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { 309 (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
414 on_each_cpu(do_flush_tlb_all, NULL, 1); 310 on_each_cpu(do_flush_tlb_all, NULL, 1);
415 } else { 311 } else {
416 struct flush_tlb_info info; 312 struct flush_tlb_info info;
417 info.flush_start = start; 313 info.start = start;
418 info.flush_end = end; 314 info.end = end;
419 on_each_cpu(do_kernel_range_flush, &info, 1); 315 on_each_cpu(do_kernel_range_flush, &info, 1);
420 } 316 }
421} 317}
422 318
319void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
320{
321 struct flush_tlb_info info = {
322 .mm = NULL,
323 .start = 0UL,
324 .end = TLB_FLUSH_ALL,
325 };
326
327 int cpu = get_cpu();
328
329 if (cpumask_test_cpu(cpu, &batch->cpumask)) {
330 VM_WARN_ON(irqs_disabled());
331 local_irq_disable();
332 flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
333 local_irq_enable();
334 }
335
336 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
337 flush_tlb_others(&batch->cpumask, &info);
338 cpumask_clear(&batch->cpumask);
339
340 put_cpu();
341}
342
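Editor's note: the new arch_tlbbatch_flush() above is the x86 side of the batched-unmap TLB flush. The caller accumulates, in batch->cpumask, the CPUs that may still cache translations for the pages being unmapped, then pays for one flush round at the end. A hypothetical caller could look like the sketch below (sketch_unmap_range() and its body are invented; only the batch structure and arch_tlbbatch_flush() come from this series):

/*
 * Hypothetical user of the batched-unmap flush hook (sketch only; in the
 * kernel the real caller is the generic page-reclaim code).
 */
static void sketch_unmap_range(struct mm_struct *mm)
{
	struct arch_tlbflush_unmap_batch batch;

	cpumask_clear(&batch.cpumask);

	/* ... clear the PTEs, noting which CPUs might have them cached ... */
	cpumask_or(&batch.cpumask, &batch.cpumask, mm_cpumask(mm));

	/* One IPI round for the whole batch instead of one per page. */
	arch_tlbbatch_flush(&batch);
}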
423static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, 343static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
424 size_t count, loff_t *ppos) 344 size_t count, loff_t *ppos)
425{ 345{
@@ -465,5 +385,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
465 return 0; 385 return 0;
466} 386}
467late_initcall(create_tlb_single_page_flush_ceiling); 387late_initcall(create_tlb_single_page_flush_ceiling);
468
469#endif /* CONFIG_SMP */
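Editor's note: stepping back from the tlb.c changes as a whole, every flush request is now described once by a struct flush_tlb_info (an mm plus a start/end range, with TLB_FLUSH_ALL meaning everything), and that single descriptor is handed both to the local flush helper and to flush_tlb_others(). A condensed sketch of the resulting flush_tlb_mm_range() flow, leaving out the tlb_single_page_flush_ceiling heuristic and the statistics shown in the hunks above:

/*
 * Condensed sketch of the new flush_tlb_mm_range() flow (illustrative;
 * see the hunks above for the complete version).
 */
static void sketch_flush_mm_range(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
{
	struct flush_tlb_info info = {
		.mm    = mm,
		.start = start,
		.end   = end,		/* or TLB_FLUSH_ALL for a full flush */
	};
	int cpu = get_cpu();		/* stay on this CPU, like the real code */

	/* Flush locally only if this mm is actually loaded here. */
	if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
		local_irq_disable();
		flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
		local_irq_enable();
	}

	/* The very same descriptor is then passed on to the other CPUs. */
	if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
		flush_tlb_others(mm_cpumask(mm), &info);

	put_cpu();
}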
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8ff1f95627f9..9bf72f5bfedb 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -80,7 +80,7 @@ pgd_t * __init efi_call_phys_prolog(void)
80 int n_pgds, i, j; 80 int n_pgds, i, j;
81 81
82 if (!efi_enabled(EFI_OLD_MEMMAP)) { 82 if (!efi_enabled(EFI_OLD_MEMMAP)) {
83 save_pgd = (pgd_t *)read_cr3(); 83 save_pgd = (pgd_t *)__read_cr3();
84 write_cr3((unsigned long)efi_scratch.efi_pgt); 84 write_cr3((unsigned long)efi_scratch.efi_pgt);
85 goto out; 85 goto out;
86 } 86 }
@@ -649,7 +649,7 @@ efi_status_t efi_thunk_set_virtual_address_map(
649 efi_sync_low_kernel_mappings(); 649 efi_sync_low_kernel_mappings();
650 local_irq_save(flags); 650 local_irq_save(flags);
651 651
652 efi_scratch.prev_cr3 = read_cr3(); 652 efi_scratch.prev_cr3 = __read_cr3();
653 write_cr3((unsigned long)efi_scratch.efi_pgt); 653 write_cr3((unsigned long)efi_scratch.efi_pgt);
654 __flush_tlb_all(); 654 __flush_tlb_all();
655 655
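Editor's note: the efi_64.c hunks above are part of a mechanical change that runs through the rest of this diff: read_cr3() is replaced by __read_cr3() where the raw register value (which may now carry PCID bits) is wanted, and by read_cr3_pa() where only the physical address of the page-table root is needed. Roughly, and only as a sketch of the helpers' intent rather than a copy of the new headers:

/*
 * Rough model of the new CR3 accessors (sketch; the real definitions live
 * in the x86 headers touched earlier in this series).
 */
static inline unsigned long sketch_read_cr3_pa(void)
{
	/* Mask off PCID/flag bits so only the pgd's physical address remains. */
	return __read_cr3() & CR3_ADDR_MASK;
}

static pgd_t *sketch_current_pgd(void)
{
	/* What callers such as the olpc and hibernate code below now effectively do. */
	return (pgd_t *)__va(sketch_read_cr3_pa());
}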
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
index c5350fd27d70..0668aaff8bfe 100644
--- a/arch/x86/platform/olpc/olpc-xo1-pm.c
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -77,7 +77,7 @@ static int xo1_power_state_enter(suspend_state_t pm_state)
77 77
78asmlinkage __visible int xo1_do_sleep(u8 sleep_state) 78asmlinkage __visible int xo1_do_sleep(u8 sleep_state)
79{ 79{
80 void *pgd_addr = __va(read_cr3()); 80 void *pgd_addr = __va(read_cr3_pa());
81 81
82 /* Program wakeup mask (using dword access to CS5536_PM1_EN) */ 82 /* Program wakeup mask (using dword access to CS5536_PM1_EN) */
83 outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS); 83 outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 795671593528..2983faab5b18 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1123,11 +1123,9 @@ static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
1123 * done. The returned pointer is valid till preemption is re-enabled. 1123 * done. The returned pointer is valid till preemption is re-enabled.
1124 */ 1124 */
1125const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 1125const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1126 struct mm_struct *mm, 1126 const struct flush_tlb_info *info)
1127 unsigned long start,
1128 unsigned long end,
1129 unsigned int cpu)
1130{ 1127{
1128 unsigned int cpu = smp_processor_id();
1131 int locals = 0, remotes = 0, hubs = 0; 1129 int locals = 0, remotes = 0, hubs = 0;
1132 struct bau_desc *bau_desc; 1130 struct bau_desc *bau_desc;
1133 struct cpumask *flush_mask; 1131 struct cpumask *flush_mask;
@@ -1181,8 +1179,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
1181 1179
1182 record_send_statistics(stat, locals, hubs, remotes, bau_desc); 1180 record_send_statistics(stat, locals, hubs, remotes, bau_desc);
1183 1181
1184 if (!end || (end - start) <= PAGE_SIZE) 1182 if (!info->end || (info->end - info->start) <= PAGE_SIZE)
1185 address = start; 1183 address = info->start;
1186 else 1184 else
1187 address = TLB_FLUSH_ALL; 1185 address = TLB_FLUSH_ALL;
1188 1186
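Editor's note: with the uv_flush_tlb_others() change above, every flush backend — native_flush_tlb_others(), the UV BAU path here, and xen_flush_tlb_others() further down — shares one calling convention: a CPU mask plus a pointer to the caller's flush_tlb_info. A hypothetical backend following that convention could be as small as the sketch below (the sketch_send_* helpers are invented placeholders; only the signature and the single-page check mirror this diff):

/*
 * Hypothetical flush_tlb_others() backend using the new convention
 * (sketch only; sketch_send_* are not real kernel APIs).
 */
void sketch_send_invlpg(const struct cpumask *cpumask, unsigned long addr);
void sketch_send_full_flush(const struct cpumask *cpumask);

static void sketch_flush_tlb_others(const struct cpumask *cpumask,
				    const struct flush_tlb_info *info)
{
	if (info->end != TLB_FLUSH_ALL &&
	    (info->end - info->start) <= PAGE_SIZE) {
		/* A single page: ask the remote CPUs for a targeted INVLPG. */
		sketch_send_invlpg(cpumask, info->start);
	} else {
		/* Anything larger: request a full TLB flush instead. */
		sketch_send_full_flush(cpumask);
	}
}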
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 6b05a9219ea2..78459a6d455a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -129,7 +129,7 @@ static void __save_processor_state(struct saved_context *ctxt)
129 */ 129 */
130 ctxt->cr0 = read_cr0(); 130 ctxt->cr0 = read_cr0();
131 ctxt->cr2 = read_cr2(); 131 ctxt->cr2 = read_cr2();
132 ctxt->cr3 = read_cr3(); 132 ctxt->cr3 = __read_cr3();
133 ctxt->cr4 = __read_cr4(); 133 ctxt->cr4 = __read_cr4();
134#ifdef CONFIG_X86_64 134#ifdef CONFIG_X86_64
135 ctxt->cr8 = read_cr8(); 135 ctxt->cr8 = read_cr8();
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index a6e21fee22ea..e3e62c8a8e70 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -150,7 +150,8 @@ static int relocate_restore_code(void)
150 memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE); 150 memcpy((void *)relocated_restore_code, &core_restore_code, PAGE_SIZE);
151 151
152 /* Make the page containing the relocated code executable */ 152 /* Make the page containing the relocated code executable */
153 pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); 153 pgd = (pgd_t *)__va(read_cr3_pa()) +
154 pgd_index(relocated_restore_code);
154 p4d = p4d_offset(pgd, relocated_restore_code); 155 p4d = p4d_offset(pgd, relocated_restore_code);
155 if (p4d_large(*p4d)) { 156 if (p4d_large(*p4d)) {
156 set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); 157 set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index a163a90af4aa..cd4be19c36dc 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -102,7 +102,7 @@ static void __init setup_real_mode(void)
102 102
103 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 103 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
104 trampoline_pgd[0] = trampoline_pgd_entry.pgd; 104 trampoline_pgd[0] = trampoline_pgd_entry.pgd;
105 trampoline_pgd[511] = init_level4_pgt[511].pgd; 105 trampoline_pgd[511] = init_top_pgt[511].pgd;
106#endif 106#endif
107} 107}
108 108
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 1f386d7fdf70..1d7a7213a310 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -975,37 +975,32 @@ static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
975 spin_unlock(&mm->page_table_lock); 975 spin_unlock(&mm->page_table_lock);
976} 976}
977 977
978 978static void drop_mm_ref_this_cpu(void *info)
979#ifdef CONFIG_SMP
980/* Another cpu may still have their %cr3 pointing at the pagetable, so
981 we need to repoint it somewhere else before we can unpin it. */
982static void drop_other_mm_ref(void *info)
983{ 979{
984 struct mm_struct *mm = info; 980 struct mm_struct *mm = info;
985 struct mm_struct *active_mm;
986
987 active_mm = this_cpu_read(cpu_tlbstate.active_mm);
988 981
989 if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) 982 if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
990 leave_mm(smp_processor_id()); 983 leave_mm(smp_processor_id());
991 984
992 /* If this cpu still has a stale cr3 reference, then make sure 985 /*
993 it has been flushed. */ 986 * If this cpu still has a stale cr3 reference, then make sure
987 * it has been flushed.
988 */
994 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) 989 if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
995 load_cr3(swapper_pg_dir); 990 xen_mc_flush();
996} 991}
997 992
993#ifdef CONFIG_SMP
994/*
995 * Another cpu may still have their %cr3 pointing at the pagetable, so
996 * we need to repoint it somewhere else before we can unpin it.
997 */
998static void xen_drop_mm_ref(struct mm_struct *mm) 998static void xen_drop_mm_ref(struct mm_struct *mm)
999{ 999{
1000 cpumask_var_t mask; 1000 cpumask_var_t mask;
1001 unsigned cpu; 1001 unsigned cpu;
1002 1002
1003 if (current->active_mm == mm) { 1003 drop_mm_ref_this_cpu(mm);
1004 if (current->mm == mm)
1005 load_cr3(swapper_pg_dir);
1006 else
1007 leave_mm(smp_processor_id());
1008 }
1009 1004
1010 /* Get the "official" set of cpus referring to our pagetable. */ 1005 /* Get the "official" set of cpus referring to our pagetable. */
1011 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1006 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
@@ -1013,31 +1008,31 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1013 if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1008 if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1014 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1009 && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1015 continue; 1010 continue;
1016 smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 1011 smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
1017 } 1012 }
1018 return; 1013 return;
1019 } 1014 }
1020 cpumask_copy(mask, mm_cpumask(mm)); 1015 cpumask_copy(mask, mm_cpumask(mm));
1021 1016
1022 /* It's possible that a vcpu may have a stale reference to our 1017 /*
1023 cr3, because it's in lazy mode, and it hasn't yet flushed 1018 * cr3, because it's in lazy mode, and it hasn't yet flushed
1024 its set of pending hypercalls yet. In this case, we can 1019 * cr3, because its in lazy mode, and it hasn't yet flushed
1025 look at its actual current cr3 value, and force it to flush 1020 * its set of pending hypercalls yet. In this case, we can
1026 if needed. */ 1021 * look at its actual current cr3 value, and force it to flush
1022 * if needed.
1023 */
1027 for_each_online_cpu(cpu) { 1024 for_each_online_cpu(cpu) {
1028 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1025 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1029 cpumask_set_cpu(cpu, mask); 1026 cpumask_set_cpu(cpu, mask);
1030 } 1027 }
1031 1028
1032 if (!cpumask_empty(mask)) 1029 smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
1033 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1034 free_cpumask_var(mask); 1030 free_cpumask_var(mask);
1035} 1031}
1036#else 1032#else
1037static void xen_drop_mm_ref(struct mm_struct *mm) 1033static void xen_drop_mm_ref(struct mm_struct *mm)
1038{ 1034{
1039 if (current->active_mm == mm) 1035 drop_mm_ref_this_cpu(mm);
1040 load_cr3(swapper_pg_dir);
1041} 1036}
1042#endif 1037#endif
1043 1038
@@ -1366,8 +1361,7 @@ static void xen_flush_tlb_single(unsigned long addr)
1366} 1361}
1367 1362
1368static void xen_flush_tlb_others(const struct cpumask *cpus, 1363static void xen_flush_tlb_others(const struct cpumask *cpus,
1369 struct mm_struct *mm, unsigned long start, 1364 const struct flush_tlb_info *info)
1370 unsigned long end)
1371{ 1365{
1372 struct { 1366 struct {
1373 struct mmuext_op op; 1367 struct mmuext_op op;
@@ -1379,7 +1373,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1379 } *args; 1373 } *args;
1380 struct multicall_space mcs; 1374 struct multicall_space mcs;
1381 1375
1382 trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); 1376 trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
1383 1377
1384 if (cpumask_empty(cpus)) 1378 if (cpumask_empty(cpus))
1385 return; /* nothing to do */ 1379 return; /* nothing to do */
@@ -1393,9 +1387,10 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1393 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 1387 cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1394 1388
1395 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 1389 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1396 if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 1390 if (info->end != TLB_FLUSH_ALL &&
1391 (info->end - info->start) <= PAGE_SIZE) {
1397 args->op.cmd = MMUEXT_INVLPG_MULTI; 1392 args->op.cmd = MMUEXT_INVLPG_MULTI;
1398 args->op.arg1.linear_addr = start; 1393 args->op.arg1.linear_addr = info->start;
1399 } 1394 }
1400 1395
1401 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 1396 MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
@@ -1470,8 +1465,8 @@ static void xen_write_cr3(unsigned long cr3)
1470 * At the start of the day - when Xen launches a guest, it has already 1465 * At the start of the day - when Xen launches a guest, it has already
1471 * built pagetables for the guest. We diligently look over them 1466 * built pagetables for the guest. We diligently look over them
1472 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the 1467 * in xen_setup_kernel_pagetable and graft them, as appropriate, into the
1473 * init_level4_pgt and its friends. Then when we are happy we load 1468 * init_top_pgt and its friends. Then when we are happy we load
1474 * the new init_level4_pgt - and continue on. 1469 * the new init_top_pgt - and continue on.
1475 * 1470 *
1476 * The generic code starts (start_kernel) and 'init_mem_mapping' sets 1471 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1477 * up the rest of the pagetables. When it has completed it loads the cr3. 1472 * up the rest of the pagetables. When it has completed it loads the cr3.
@@ -1914,12 +1909,12 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1914 pt_end = pt_base + xen_start_info->nr_pt_frames; 1909 pt_end = pt_base + xen_start_info->nr_pt_frames;
1915 1910
1916 /* Zap identity mapping */ 1911 /* Zap identity mapping */
1917 init_level4_pgt[0] = __pgd(0); 1912 init_top_pgt[0] = __pgd(0);
1918 1913
1919 /* Pre-constructed entries are in pfn, so convert to mfn */ 1914 /* Pre-constructed entries are in pfn, so convert to mfn */
1920 /* L4[272] -> level3_ident_pgt */ 1915 /* L4[272] -> level3_ident_pgt */
1921 /* L4[511] -> level3_kernel_pgt */ 1916 /* L4[511] -> level3_kernel_pgt */
1922 convert_pfn_mfn(init_level4_pgt); 1917 convert_pfn_mfn(init_top_pgt);
1923 1918
1924 /* L3_i[0] -> level2_ident_pgt */ 1919 /* L3_i[0] -> level2_ident_pgt */
1925 convert_pfn_mfn(level3_ident_pgt); 1920 convert_pfn_mfn(level3_ident_pgt);
@@ -1950,10 +1945,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1950 /* Copy the initial P->M table mappings if necessary. */ 1945 /* Copy the initial P->M table mappings if necessary. */
1951 i = pgd_index(xen_start_info->mfn_list); 1946 i = pgd_index(xen_start_info->mfn_list);
1952 if (i && i < pgd_index(__START_KERNEL_map)) 1947 if (i && i < pgd_index(__START_KERNEL_map))
1953 init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; 1948 init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1954 1949
1955 /* Make pagetable pieces RO */ 1950 /* Make pagetable pieces RO */
1956 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 1951 set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
1957 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 1952 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1958 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 1953 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1959 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 1954 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
@@ -1964,7 +1959,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1964 1959
1965 /* Pin down new L4 */ 1960 /* Pin down new L4 */
1966 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, 1961 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1967 PFN_DOWN(__pa_symbol(init_level4_pgt))); 1962 PFN_DOWN(__pa_symbol(init_top_pgt)));
1968 1963
1969 /* Unpin Xen-provided one */ 1964 /* Unpin Xen-provided one */
1970 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1965 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1974,7 +1969,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1974 * attach it to, so make sure we just set kernel pgd. 1969 * attach it to, so make sure we just set kernel pgd.
1975 */ 1970 */
1976 xen_mc_batch(); 1971 xen_mc_batch();
1977 __xen_write_cr3(true, __pa(init_level4_pgt)); 1972 __xen_write_cr3(true, __pa(init_top_pgt));
1978 xen_mc_issue(PARAVIRT_LAZY_CPU); 1973 xen_mc_issue(PARAVIRT_LAZY_CPU);
1979 1974
1980 /* We can't that easily rip out L3 and L2, as the Xen pagetables are 1975 /* We can't that easily rip out L3 and L2, as the Xen pagetables are
@@ -2022,7 +2017,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2022 pmd_t pmd; 2017 pmd_t pmd;
2023 pte_t pte; 2018 pte_t pte;
2024 2019
2025 pa = read_cr3(); 2020 pa = read_cr3_pa();
2026 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * 2021 pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2027 sizeof(pgd))); 2022 sizeof(pgd)));
2028 if (!pgd_present(pgd)) 2023 if (!pgd_present(pgd))
@@ -2102,7 +2097,7 @@ void __init xen_relocate_p2m(void)
2102 pt_phys = pmd_phys + PFN_PHYS(n_pmd); 2097 pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2103 p2m_pfn = PFN_DOWN(pt_phys) + n_pt; 2098 p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2104 2099
2105 pgd = __va(read_cr3()); 2100 pgd = __va(read_cr3_pa());
2106 new_p2m = (unsigned long *)(2 * PGDIR_SIZE); 2101 new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
2107 idx_p4d = 0; 2102 idx_p4d = 0;
2108 save_pud = n_pud; 2103 save_pud = n_pud;
@@ -2209,7 +2204,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
2209{ 2204{
2210 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 2205 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2211 2206
2212 BUG_ON(read_cr3() != __pa(initial_page_table)); 2207 BUG_ON(read_cr3_pa() != __pa(initial_page_table));
2213 BUG_ON(cr3 != __pa(swapper_pg_dir)); 2208 BUG_ON(cr3 != __pa(swapper_pg_dir));
2214 2209
2215 /* 2210 /*
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
index 5e246716d58f..e1a5fbeae08d 100644
--- a/arch/x86/xen/xen-pvh.S
+++ b/arch/x86/xen/xen-pvh.S
@@ -87,7 +87,7 @@ ENTRY(pvh_start_xen)
87 wrmsr 87 wrmsr
88 88
89 /* Enable pre-constructed page tables. */ 89 /* Enable pre-constructed page tables. */
90 mov $_pa(init_level4_pgt), %eax 90 mov $_pa(init_top_pgt), %eax
91 mov %eax, %cr3 91 mov %eax, %cr3
92 mov $(X86_CR0_PG | X86_CR0_PE), %eax 92 mov $(X86_CR0_PG | X86_CR0_PE), %eax
93 mov %eax, %cr0 93 mov %eax, %cr0