author	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-23 12:05:28 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-23 12:05:28 -0400
commit	99792e0cea1ed733cdc8d0758677981e0cbebfed (patch)
tree	acf6868f48f687dd8667ee4f99c156415ea8ff7b
parent	382d72a9aa525b56ab8453ce61751fa712414d3d (diff)
parent	977e4be5eb714c48a67afc26a6c477f24130a1f2 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijlstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
-rw-r--r--	Documentation/x86/x86_64/mm.txt	171
-rw-r--r--	arch/x86/Kconfig	8
-rw-r--r--	arch/x86/include/asm/io.h	3
-rw-r--r--	arch/x86/include/asm/kexec.h	2
-rw-r--r--	arch/x86/include/asm/page_64_types.h	15
-rw-r--r--	arch/x86/include/asm/tlb.h	21
-rw-r--r--	arch/x86/include/asm/tlbflush.h	33
-rw-r--r--	arch/x86/kernel/crash_dump_64.c	60
-rw-r--r--	arch/x86/kernel/ldt.c	2
-rw-r--r--	arch/x86/kernel/vm86_32.c	2
-rw-r--r--	arch/x86/mm/dump_pagetables.c	35
-rw-r--r--	arch/x86/mm/fault.c	288
-rw-r--r--	arch/x86/mm/init_32.c	23
-rw-r--r--	arch/x86/mm/ioremap.c	24
-rw-r--r--	arch/x86/mm/pageattr.c	627
-rw-r--r--	arch/x86/mm/tlb.c	167
-rw-r--r--	arch/x86/xen/smp_pv.c	2
-rw-r--r--	drivers/iommu/amd_iommu_init.c	14
-rw-r--r--	fs/proc/vmcore.c	34
-rw-r--r--	include/linux/crash_dump.h	4
-rw-r--r--	include/linux/smp.h	4
-rw-r--r--	kernel/kexec_core.c	6
-rw-r--r--	kernel/resource.c	141
-rw-r--r--	kernel/sched/idle.c	15
-rw-r--r--	kernel/sched/sched.h	1
-rw-r--r--	kernel/smp.c	19
-rw-r--r--	kernel/up.c	14
-rw-r--r--	mm/pgtable-generic.c	1
28 files changed, 1117 insertions, 619 deletions
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5432a96d31ff..702898633b00 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -1,55 +1,124 @@
-
-Virtual memory map with 4 level page tables:
-
-0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [47:63] sign extension
-ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
-ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
-ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
-ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
-ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
-... unused hole ...
-ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
-... unused hole ...
-				    vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
-
-Virtual memory map with 5 level page tables:
-
-0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
-hole caused by [56:63] sign extension
-ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
-ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
-ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
-ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
-ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
-... unused hole ...
-ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
-... unused hole ...
-				    vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-... unused hole ...
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+====================================================
+Complete virtual memory map with 4-level page tables
+====================================================
+
+Notes:
+
+ - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
+   from the top of the 64-bit address space. It's easier to understand the layout
+   when seen both in absolute addresses and in distance-from-top notation.
+
+   For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the
+   64-bit address space (ffffffffffffffff).
+
+   Note that as we get closer to the top of the address space, the notation changes
+   from TB to GB and then MB/KB.
+
+ - "16M TB" might look weird at first sight, but it's an easier to visualize size
+   notation than "16 EB", which few will recognize at first sight as 16 exabytes.
+   It also shows it nicely how incredibly large 64-bit address space is.
+
+========================================================================================================================
+    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+========================================================================================================================
+                  |            |                  |         |
+ 0000000000000000 |    0       | 00007fffffffffff |  128 TB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+                  |            |                  |         |
+ 0000800000000000 | +128    TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
+                  |            |                  |         |     virtual memory addresses up to the -128 TB
+                  |            |                  |         |     starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+                                                            |
+                                                            | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+                  |            |                  |         |
+ ffff800000000000 | -128    TB | ffff87ffffffffff |    8 TB | ... guard hole, also reserved for hypervisor
+ ffff880000000000 | -120    TB | ffffc7ffffffffff |   64 TB | direct mapping of all physical memory (page_offset_base)
+ ffffc80000000000 |  -56    TB | ffffc8ffffffffff |    1 TB | ... unused hole
+ ffffc90000000000 |  -55    TB | ffffe8ffffffffff |   32 TB | vmalloc/ioremap space (vmalloc_base)
+ ffffe90000000000 |  -23    TB | ffffe9ffffffffff |    1 TB | ... unused hole
+ ffffea0000000000 |  -22    TB | ffffeaffffffffff |    1 TB | virtual memory map (vmemmap_base)
+ ffffeb0000000000 |  -21    TB | ffffebffffffffff |    1 TB | ... unused hole
+ ffffec0000000000 |  -20    TB | fffffbffffffffff |   16 TB | KASAN shadow memory
+ fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+                  |            |                  |         | vaddr_end for KASLR
+ fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 | -1.5    TB | fffffeffffffffff |  0.5 TB | LDT remap for PTI
+ ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+                                                            |
+                                                            | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+                  |            |                  |         |
+ ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048    MB |                  |         |
+ ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 |  -16    MB |                  |         |
+    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
+
+
+====================================================
+Complete virtual memory map with 5-level page tables
+====================================================
+
+Notes:
+
+ - With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
+   from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting
+   offset and many of the regions expand to support the much larger physical
+   memory supported.
+
+========================================================================================================================
+    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+========================================================================================================================
+                  |            |                  |         |
+ 0000000000000000 |    0       | 00ffffffffffffff |   64 PB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+                  |            |                  |         |
+ 0000800000000000 |  +64    PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
+                  |            |                  |         |     virtual memory addresses up to the -128 TB
+                  |            |                  |         |     starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+                                                            |
+                                                            | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+                  |            |                  |         |
+ ff00000000000000 |  -64    PB | ff0fffffffffffff |    4 PB | ... guard hole, also reserved for hypervisor
+ ff10000000000000 |  -60    PB | ff8fffffffffffff |   32 PB | direct mapping of all physical memory (page_offset_base)
+ ff90000000000000 |  -28    PB | ff9fffffffffffff |    4 PB | LDT remap for PTI
+ ffa0000000000000 |  -24    PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
+ ffd2000000000000 | -11.5   PB | ffd3ffffffffffff |  0.5 PB | ... unused hole
+ ffd4000000000000 |  -11    PB | ffd5ffffffffffff |  0.5 PB | virtual memory map (vmemmap_base)
+ ffd6000000000000 | -10.5   PB | ffdeffffffffffff | 2.25 PB | ... unused hole
+ ffdf000000000000 |  -8.25  PB | fffffdffffffffff |   ~8 PB | KASAN shadow memory
+ fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+                  |            |                  |         | vaddr_end for KASLR
+ fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 | -1.5    TB | fffffeffffffffff |  0.5 TB | ... unused hole
+ ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+                                                            |
+                                                            | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+                  |            |                  |         |
+ ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048    MB |                  |         |
+ ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 |  -16    MB |                  |         |
+    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
 
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
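
The distance-from-top notation introduced above can be sanity-checked with trivial arithmetic. The following standalone snippet is an illustration added here (not part of the patch); it reproduces the "-23 TB" example from the notes:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long top = 0xffffffffffffffffULL;   /* top of the 64-bit space */
            unsigned long long hole = 0xffffe90000000000ULL;  /* start of the 1 TB unused hole */

            /* +1 because 0xffffffffffffffff is the last byte of the space */
            unsigned long long dist = top - hole + 1;

            printf("-%llu TB\n", dist >> 40);                  /* prints "-23 TB" */
            return 0;
    }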
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8282985d438a..ff425a2d286c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
 	  supports them), so don't confuse the user by printing
 	  that we have them enabled.
 
+config X86_CPA_STATISTICS
+	bool "Enable statistic for Change Page Attribute"
+	depends on DEBUG_FS
+	---help---
+	  Expose statistics about the Change Page Attribute mechanism, which
+	  helps to determine the effectiveness of preserving large and huge
+	  page mappings when mapping protections are changed.
+
 config ARCH_HAS_MEM_ENCRYPT
 	def_bool y
 
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 9a92a3ac2ac5..832da8229cc7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
 #define ioremap_nocache ioremap_nocache
 extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 #define ioremap_uc ioremap_uc
-
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap     -   map bus memory into CPU space
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index f327236f0fa7..5125fca472bb 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -67,7 +67,7 @@ struct kimage;
 
 /* Memory to backup during crash kdump */
 #define KEXEC_BACKUP_SRC_START	(0UL)
-#define KEXEC_BACKUP_SRC_END	(640 * 1024UL)		/* 640K */
+#define KEXEC_BACKUP_SRC_END	(640 * 1024UL - 1)	/* 640K */
 
 /*
  * CPU does not save ss and sp on stack if execution is already
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 6afac386a434..cd0cf1c568b4 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -59,13 +59,16 @@
 #endif
 
 /*
- * Kernel image size is limited to 1GiB due to the fixmap living in the
- * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
- * 512MiB by default, leaving 1.5GiB for modules once the page tables
- * are fully set up. If kernel ASLR is configured, it can extend the
- * kernel page table mapping, reducing the size of the modules area.
+ * Maximum kernel image size is limited to 1 GiB, due to the fixmap living
+ * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
+ *
+ * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
+ * page tables are fully set up.
+ *
+ * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
+ * of the modules area to 1.5 GiB.
  */
-#if defined(CONFIG_RANDOMIZE_BASE)
+#ifdef CONFIG_RANDOMIZE_BASE
 #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
 #else
 #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index cb0a1f470980..404b8b1d44f5 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -6,16 +6,23 @@
 #define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
-#define tlb_flush(tlb)							\
-{									\
-	if (!tlb->fullmm && !tlb->need_flush_all)			\
-		flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL);	\
-	else								\
-		flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL);	\
-}
+static inline void tlb_flush(struct mmu_gather *tlb);
 
 #include <asm-generic/tlb.h>
 
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+	unsigned long start = 0UL, end = TLB_FLUSH_ALL;
+	unsigned int stride_shift = tlb_get_unmap_shift(tlb);
+
+	if (!tlb->fullmm && !tlb->need_flush_all) {
+		start = tlb->start;
+		end = tlb->end;
+	}
+
+	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+}
+
 /*
  * While x86 architecture in general requires an IPI to perform TLB
  * shootdown, enablement code for several hypervisors overrides
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 58ce5288878e..323a313947e0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast. If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely. The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic. The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -547,23 +531,30 @@ struct flush_tlb_info {
 	unsigned long		start;
 	unsigned long		end;
 	u64			new_tlb_gen;
+	unsigned int		stride_shift;
+	bool			freed_tables;
 };
 
 #define local_flush_tlb() __flush_tlb()
 
-#define flush_tlb_mm(mm)	flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+#define flush_tlb_mm(mm)						\
+		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
 
 #define flush_tlb_range(vma, start, end)				\
-	flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
+	flush_tlb_mm_range((vma)->vm_mm, start, end,			\
+			   ((vma)->vm_flags & VM_HUGETLB)		\
+				? huge_page_shift(hstate_vma(vma))	\
+				: PAGE_SHIFT, false)
 
 extern void flush_tlb_all(void);
 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag);
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 {
-	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
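
For illustration only (not part of this series): with the new prototype above, callers pass the page-walk stride and whether page tables were freed instead of VM flags. A hypothetical flush of a huge-page-mapped range whose page tables were also torn down might look like:

    /* Hypothetical caller, sketched against the new interface above:
     * the 2 MB stride lets the flush use one INVLPG per huge page, and
     * freed_tables == true tells remote CPUs to drop cached intermediate
     * paging-structure entries as well.
     */
    static void example_flush(struct mm_struct *mm, unsigned long start, unsigned long end)
    {
            flush_tlb_mm_range(mm, start, end, PMD_SHIFT, true);
    }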
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e0778feac..eb8ab3915268 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -11,40 +11,62 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- *	space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- *	otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-		size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+				  unsigned long offset, int userbuf,
+				  bool encrypted)
 {
 	void *vaddr;
 
 	if (!csize)
 		return 0;
 
-	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (encrypted)
+		vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+	else
+		vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+
 	if (!vaddr)
 		return -ENOMEM;
 
 	if (userbuf) {
-		if (copy_to_user(buf, vaddr + offset, csize)) {
-			iounmap(vaddr);
+		if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+			iounmap((void __iomem *)vaddr);
 			return -EFAULT;
 		}
 	} else
 		memcpy(buf, vaddr + offset, csize);
 
 	set_iounmap_nonlazy();
-	iounmap(vaddr);
+	iounmap((void __iomem *)vaddr);
 	return csize;
 }
+
+/**
+ * copy_oldmem_page - copy one page of memory
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *	space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *	otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from the old kernel's memory. For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+			 unsigned long offset, int userbuf)
+{
+	return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
+}
+
+/**
+ * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
+ * memory with the encryption mask set to accommodate kdump on SME-enabled
+ * machines.
+ */
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+				   unsigned long offset, int userbuf)
+{
+	return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
+}
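
A rough sketch of how a kdump read path can choose between the two helpers above; the function name and encrypted flag plumbing below are placeholders for illustration — the real selection logic lives in fs/proc/vmcore.c (also touched by this merge) and is not shown in this excerpt:

    /* Illustrative only: dispatch to the encrypted variant when the old
     * kernel's memory was SME-encrypted, otherwise use the plain copy.
     */
    static ssize_t read_oldmem_pfn(unsigned long pfn, char *buf, size_t csize,
                                   unsigned long offset, int userbuf, bool encrypted)
    {
            if (encrypted)
                    return copy_oldmem_page_encrypted(pfn, buf, csize, offset, userbuf);

            return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
    }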
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 733e6ace0fa4..ab18e0884dc6 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 	map_ldt_struct_to_user(mm);
 
 	va = (unsigned long)ldt_slot_va(slot);
-	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
 
 	ldt->slot = slot;
 	return 0;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 1c03e4aa6474..c2fd39752da8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
199 pte_unmap_unlock(pte, ptl); 199 pte_unmap_unlock(pte, ptl);
200out: 200out:
201 up_write(&mm->mmap_sem); 201 up_write(&mm->mmap_sem);
202 flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); 202 flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
203} 203}
204 204
205 205
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a12afff146d1..fc37bbd23eb8 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -19,7 +19,9 @@
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
+#include <linux/pci.h>
 
+#include <asm/e820/types.h>
 #include <asm/pgtable.h>
 
 /*
@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
 	return (signed long)(u << shift) >> shift;
 }
 
+static void note_wx(struct pg_state *st)
+{
+	unsigned long npages;
+
+	npages = (st->current_address - st->start_address) / PAGE_SIZE;
+
+#ifdef CONFIG_PCI_BIOS
+	/*
+	 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
+	 * Inform about it, but avoid the warning.
+	 */
+	if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
+	    st->current_address <= PAGE_OFFSET + BIOS_END) {
+		pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
+		return;
+	}
+#endif
+	/* Account the WX pages */
+	st->wx_pages += npages;
+	WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
+		  (void *)st->start_address);
+}
+
 /*
  * This function gets called on a break in a continuous series
  * of PTE entries; the next one is different so we need to
@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		unsigned long delta;
 		int width = sizeof(unsigned long) * 2;
 
-		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
-			WARN_ONCE(1,
-				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
-				  (void *)st->start_address,
-				  (void *)st->start_address);
-			st->wx_pages += (st->current_address -
-					 st->start_address) / PAGE_SIZE;
-		}
+		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
+			note_wx(st);
 
 		/*
 		 * Now print the actual finished series
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0d45f6debb3a..2b1519bc5381 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	show_opcodes(regs, loglvl);
 }
 
+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
+
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		       unsigned long address, u32 *pkey, int si_code)
@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	if (is_errata100(regs, address))
 		return;
 
-#ifdef CONFIG_X86_64
-	/*
-	 * Instruction fetch faults in the vsyscall page might need
-	 * emulation.
-	 */
-	if (unlikely((error_code & X86_PF_INSTR) &&
-		     ((address & ~0xfff) == VSYSCALL_ADDR))) {
-		if (emulate_vsyscall(regs, address))
-			return;
-	}
-#endif
-
 	/*
 	 * To avoid leaking information about the kernel page table
 	 * layout, pretend that user-mode accesses to kernel addresses
@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	}
 }
 
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
 	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;
 
 	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
-	/*
-	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set X86_PF_PK.
-	 */
-	if ((error_code & X86_PF_PK))
-		return 1;
 
 	return 1;
 }
@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * (Optional Invalidation).
  */
 static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
 		return 0;
 
 	if (p4d_large(*p4d))
-		return spurious_fault_check(error_code, (pte_t *) p4d);
+		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
 
 	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return 0;
 
 	if (pud_large(*pud))
-		return spurious_fault_check(error_code, (pte_t *) pud);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pud);
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
 		return 0;
 
 	if (pmd_large(*pmd))
-		return spurious_fault_check(error_code, (pte_t *) pmd);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 
 	pte = pte_offset_kernel(pmd, address);
 	if (!pte_present(*pte))
 		return 0;
 
-	ret = spurious_fault_check(error_code, pte);
+	ret = spurious_kernel_fault_check(error_code, pte);
 	if (!ret)
 		return 0;
 
@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * Make sure we have permissions in PMD.
 	 * If not, then there's a bug in the page tables:
 	 */
-	ret = spurious_fault_check(error_code, (pte_t *) pmd);
+	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
 
 	return ret;
 }
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
 
 int show_unhandled_signals = 1;
 
@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 
 static int fault_in_kernel_space(unsigned long address)
 {
+	/*
+	 * On 64-bit systems, the vsyscall page is at an address above
+	 * TASK_SIZE_MAX, but is not considered part of the kernel
+	 * address space.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
+		return false;
+
 	return address >= TASK_SIZE_MAX;
 }
 
@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
1214} 1213}
1215 1214
1216/* 1215/*
1217 * This routine handles page faults. It determines the address, 1216 * Called for all faults where 'address' is part of the kernel address
1218 * and the problem, and then passes it off to one of the appropriate 1217 * space. Might get called for faults that originate from *code* that
1219 * routines. 1218 * ran in userspace or the kernel.
1220 */ 1219 */
1221static noinline void 1220static void
1222__do_page_fault(struct pt_regs *regs, unsigned long error_code, 1221do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
1223 unsigned long address) 1222 unsigned long address)
1224{ 1223{
1225 struct vm_area_struct *vma; 1224 /*
1226 struct task_struct *tsk; 1225 * Protection keys exceptions only happen on user pages. We
1227 struct mm_struct *mm; 1226 * have no user pages in the kernel portion of the address
1228 vm_fault_t fault, major = 0; 1227 * space, so do not expect them here.
1229 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 1228 */
1230 u32 pkey; 1229 WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1231
1232 tsk = current;
1233 mm = tsk->mm;
1234
1235 prefetchw(&mm->mmap_sem);
1236
1237 if (unlikely(kmmio_fault(regs, address)))
1238 return;
1239 1230
1240 /* 1231 /*
1241 * We fault-in kernel-space virtual memory on-demand. The 1232 * We can fault-in kernel-space virtual memory on-demand. The
1242 * 'reference' page table is init_mm.pgd. 1233 * 'reference' page table is init_mm.pgd.
1243 * 1234 *
1244 * NOTE! We MUST NOT take any locks for this case. We may 1235 * NOTE! We MUST NOT take any locks for this case. We may
@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * only copy the information from the master page table,
 	 * nothing more.
 	 *
-	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
+	 * Before doing this on-demand faulting, ensure that the
+	 * fault is not any of the following:
+	 * 1. A fault on a PTE with a reserved bit set.
+	 * 2. A fault caused by a user-mode access. (Do not demand-
+	 *    fault kernel memory due to user-mode accesses).
+	 * 3. A fault caused by a page-level protection violation.
+	 *    (A demand fault would be on a non-present page which
+	 *     would have X86_PF_PROT==0).
 	 */
-	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-
-		/* Can handle a stale RO->RW TLB: */
-		if (spurious_fault(error_code, address))
-			return;
-
-		/* kprobes don't want to hook the spurious faults: */
-		if (kprobes_fault(regs))
-			return;
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock:
-		 */
-		bad_area_nosemaphore(regs, error_code, address, NULL);
-
-		return;
-	}
+	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+		if (vmalloc_fault(address) >= 0)
+			return;
+	}
+
+	/* Was the fault spurious, caused by lazy TLB invalidation? */
+	if (spurious_kernel_fault(hw_error_code, address))
+		return;
+
+	/* kprobes don't want to hook the spurious faults: */
+	if (kprobes_fault(regs))
+		return;
+
+	/*
+	 * Note, despite being a "bad area", there are quite a few
+	 * acceptable reasons to get here, such as erratum fixups
+	 * and handling kernel code that can fault, like get_user().
+	 *
+	 * Don't take the mm semaphore here. If we fixup a prefetch
+	 * fault we could otherwise deadlock:
+	 */
+	bad_area_nosemaphore(regs, hw_error_code, address, NULL);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+			unsigned long hw_error_code,
+			unsigned long address)
+{
+	unsigned long sw_error_code;
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	vm_fault_t fault, major = 0;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	u32 pkey;
+
+	tsk = current;
+	mm = tsk->mm;
 
 	/* kprobes don't want to hook the spurious faults: */
 	if (unlikely(kprobes_fault(regs)))
 		return;
 
-	if (unlikely(error_code & X86_PF_RSVD))
-		pgtable_bad(regs, error_code, address);
+	/*
+	 * Reserved bits are never expected to be set on
+	 * entries in the user portion of the page tables.
+	 */
+	if (unlikely(hw_error_code & X86_PF_RSVD))
+		pgtable_bad(regs, hw_error_code, address);
 
-	if (unlikely(smap_violation(error_code, regs))) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+	/*
+	 * Check for invalid kernel (supervisor) access to user
+	 * pages in the user address space.
+	 */
+	if (unlikely(smap_violation(hw_error_code, regs))) {
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
@@ -1289,11 +1313,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * in a region with pagefaults disabled then we must not take the fault
 	 */
 	if (unlikely(faulthandler_disabled() || !mm)) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
 	/*
+	 * hw_error_code is literally the "page fault error code" passed to
+	 * the kernel directly from the hardware. But, we will shortly be
+	 * modifying it in software, so give it a new name.
+	 */
+	sw_error_code = hw_error_code;
+
+	/*
 	 * It's safe to allow irq's after cr2 has been saved and the
 	 * vmalloc fault has been handled.
 	 *
@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	if (user_mode(regs)) {
 		local_irq_enable();
-		error_code |= X86_PF_USER;
+		/*
+		 * Up to this point, X86_PF_USER set in hw_error_code
+		 * indicated a user-mode access. But, after this,
+		 * X86_PF_USER in sw_error_code will indicate either
+		 * that, *or* an implicit kernel(supervisor)-mode access
+		 * which originated from user mode.
+		 */
+		if (!(hw_error_code & X86_PF_USER)) {
+			/*
+			 * The CPU was in user mode, but the CPU says
+			 * the fault was not a user-mode access.
+			 * Must be an implicit kernel-mode access,
+			 * which we do not expect to happen in the
+			 * user address space.
+			 */
+			pr_warn_once("kernel-mode error from user-mode: %lx\n",
+				     hw_error_code);
+
+			sw_error_code |= X86_PF_USER;
+		}
 		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-	if (error_code & X86_PF_WRITE)
+	if (sw_error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & X86_PF_INSTR)
+	if (sw_error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef CONFIG_X86_64
 	/*
-	 * When running in the kernel we expect faults to occur only to
-	 * addresses in user space. All other faults represent errors in
-	 * the kernel and should generate an OOPS. Unfortunately, in the
-	 * case of an erroneous fault occurring in a code path which already
-	 * holds mmap_sem we will deadlock attempting to validate the fault
-	 * against the address space. Luckily the kernel only validly
-	 * references user space from well defined areas of code, which are
-	 * listed in the exceptions table.
+	 * Instruction fetch faults in the vsyscall page might need
+	 * emulation. The vsyscall page is at a high address
+	 * (>PAGE_OFFSET), but is considered to be part of the user
+	 * address space.
 	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a
-	 * deadlock. Attempt to lock the address space, if we cannot we then
-	 * validate the source. If this is invalid we can skip the address
-	 * space check, thus avoiding the deadlock:
+	 * The vsyscall page does not have a "real" VMA, so do this
+	 * emulation before we go searching for VMAs.
+	 */
+	if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+		if (emulate_vsyscall(regs, address))
+			return;
+	}
+#endif
+
+	/*
+	 * Kernel-mode access to the user address space should only occur
+	 * on well-defined single instructions listed in the exception
+	 * tables. But, an erroneous kernel fault occurring outside one of
+	 * those areas which also holds mmap_sem might deadlock attempting
+	 * to validate the fault against the address space.
+	 *
+	 * Only do the expensive exception table search when we might be at
+	 * risk of a deadlock. This happens if we
+	 * 1. Failed to acquire mmap_sem, and
+	 * 2. The access did not originate in userspace. Note: either the
+	 *    hardware or earlier page fault code may set X86_PF_USER
+	 *    in sw_error_code.
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if (!(error_code & X86_PF_USER) &&
+		if (!(sw_error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
-			bad_area_nosemaphore(regs, error_code, address, NULL);
+			/*
+			 * Fault from code in kernel from
+			 * which we do not expect faults.
+			 */
+			bad_area_nosemaphore(regs, sw_error_code, address, NULL);
 			return;
 		}
 retry:
@@ -1351,16 +1419,16 @@ retry:
 
 	vma = find_vma(mm, address);
 	if (unlikely(!vma)) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 	if (likely(vma->vm_start <= address))
 		goto good_area;
 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
-	if (error_code & X86_PF_USER) {
+	if (sw_error_code & X86_PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter
@@ -1368,12 +1436,12 @@ retry:
 		 * 32 pointers and then decrements %sp by 65535.)
 		 */
 		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-			bad_area(regs, error_code, address);
+			bad_area(regs, sw_error_code, address);
 			return;
 		}
 	}
 	if (unlikely(expand_stack(vma, address))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 
@@ -1382,8 +1450,8 @@ retry:
 	 * we can handle it..
 	 */
 good_area:
-	if (unlikely(access_error(error_code, vma))) {
-		bad_area_access_error(regs, error_code, address, vma);
+	if (unlikely(access_error(sw_error_code, vma))) {
+		bad_area_access_error(regs, sw_error_code, address, vma);
 		return;
 	}
 
@@ -1425,13 +1493,13 @@ good_area:
 			return;
 
 		/* Not returning to user mode? Handle exceptions or die: */
-		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
 
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, &pkey, fault);
+		mm_fault_error(regs, sw_error_code, address, &pkey, fault);
 		return;
 	}
 
@@ -1449,6 +1517,28 @@ good_area:
 
 	check_v8086_mode(regs, address, tsk);
 }
+NOKPROBE_SYMBOL(do_user_addr_fault);
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+		unsigned long address)
+{
+	prefetchw(&current->mm->mmap_sem);
+
+	if (unlikely(kmmio_fault(regs, address)))
+		return;
+
+	/* Was the fault on kernel-controlled part of the address space? */
+	if (unlikely(fault_in_kernel_space(address)))
+		do_kern_addr_fault(regs, hw_error_code, address);
+	else
+		do_user_addr_fault(regs, hw_error_code, address);
+}
 NOKPROBE_SYMBOL(__do_page_fault);
 
 static nokprobe_inline void
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 979e0a02cbe1..142c7d9f89cc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
-	unsigned long size = PFN_ALIGN(_etext) - start;
+	unsigned long size = (unsigned long)__end_rodata - start;
 
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-	printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+	pr_info("Write protecting kernel text and read-only data: %luk\n",
 		size >> 10);
 
 	kernel_set_to_readonly = 1;
 
 #ifdef CONFIG_CPA_DEBUG
-	printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
-		start, start+size);
-	set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
-
-	printk(KERN_INFO "Testing CPA: write protecting again\n");
-	set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
-
-	start += size;
-	size = (unsigned long)__end_rodata - start;
-	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
-		size >> 10);
-
-#ifdef CONFIG_CPA_DEBUG
-	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+	pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
 	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
 
-	printk(KERN_INFO "Testing CPA: write protecting again\n");
+	pr_info("Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
 	mark_nxdata_nx();
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..24e0920a9b25 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-		unsigned long size, enum page_cache_mode pcm, void *caller)
+		unsigned long size, enum page_cache_mode pcm,
+		void *caller, bool encrypted)
 {
 	unsigned long offset, vaddr;
 	resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	 * resulting mapping.
 	 */
 	prot = PAGE_KERNEL_IO;
-	if (sev_active() && mem_flags.desc_other)
+	if ((sev_active() && mem_flags.desc_other) || encrypted)
 		prot = pgprot_encrypted(prot);
 
 	switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
 	return __ioremap_caller(phys_addr, size, pcm,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
 	return __ioremap_caller(phys_addr, size, pcm,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-					__builtin_return_address(0));
+					__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-					__builtin_return_address(0));
+					__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+				__builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 {
 	return __ioremap_caller(phys_addr, size,
 				pgprot2cachemode(__pgprot(prot_val)),
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
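
A hedged usage sketch for the new ioremap_encrypted() interface above (illustrative only; the helper name below is made up — the in-tree user is __copy_oldmem_page() in arch/x86/kernel/crash_dump_64.c, shown earlier):

    /* Map one page of SME-encrypted memory with the encryption mask set,
     * copy it out, then unmap. Mirrors what the kdump copy path does.
     */
    static int read_encrypted_page(unsigned long pfn, void *dst)
    {
            void __iomem *vaddr = ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);

            if (!vaddr)
                    return -ENOMEM;

            memcpy_fromio(dst, vaddr, PAGE_SIZE);
            iounmap(vaddr);
            return 0;
    }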
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 51a5a69ecac9..62bb30b4bd2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -37,11 +37,20 @@ struct cpa_data {
 	unsigned long	numpages;
 	int		flags;
 	unsigned long	pfn;
-	unsigned	force_split	: 1;
+	unsigned	force_split		: 1,
+			force_static_prot	: 1;
 	int		curpage;
 	struct page	**pages;
 };
 
+enum cpa_warn {
+	CPA_CONFLICT,
+	CPA_PROTECT,
+	CPA_DETECT,
+};
+
+static const int cpa_warn_level = CPA_PROTECT;
+
 /*
  * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
  * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
94static inline void split_page_count(int level) { } 103static inline void split_page_count(int level) { }
95#endif 104#endif
96 105
106#ifdef CONFIG_X86_CPA_STATISTICS
107
108static unsigned long cpa_1g_checked;
109static unsigned long cpa_1g_sameprot;
110static unsigned long cpa_1g_preserved;
111static unsigned long cpa_2m_checked;
112static unsigned long cpa_2m_sameprot;
113static unsigned long cpa_2m_preserved;
114static unsigned long cpa_4k_install;
115
116static inline void cpa_inc_1g_checked(void)
117{
118 cpa_1g_checked++;
119}
120
121static inline void cpa_inc_2m_checked(void)
122{
123 cpa_2m_checked++;
124}
125
126static inline void cpa_inc_4k_install(void)
127{
128 cpa_4k_install++;
129}
130
131static inline void cpa_inc_lp_sameprot(int level)
132{
133 if (level == PG_LEVEL_1G)
134 cpa_1g_sameprot++;
135 else
136 cpa_2m_sameprot++;
137}
138
139static inline void cpa_inc_lp_preserved(int level)
140{
141 if (level == PG_LEVEL_1G)
142 cpa_1g_preserved++;
143 else
144 cpa_2m_preserved++;
145}
146
147static int cpastats_show(struct seq_file *m, void *p)
148{
149 seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
150 seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
151 seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
152 seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
153 seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
154 seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
155 seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
156 return 0;
157}
158
159static int cpastats_open(struct inode *inode, struct file *file)
160{
161 return single_open(file, cpastats_show, NULL);
162}
163
164static const struct file_operations cpastats_fops = {
165 .open = cpastats_open,
166 .read = seq_read,
167 .llseek = seq_lseek,
168 .release = single_release,
169};
170
171static int __init cpa_stats_init(void)
172{
173 debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
174 &cpastats_fops);
175 return 0;
176}
177late_initcall(cpa_stats_init);
178#else
179static inline void cpa_inc_1g_checked(void) { }
180static inline void cpa_inc_2m_checked(void) { }
181static inline void cpa_inc_4k_install(void) { }
182static inline void cpa_inc_lp_sameprot(int level) { }
183static inline void cpa_inc_lp_preserved(int level) { }
184#endif
185
186
97static inline int 187static inline int
98within(unsigned long addr, unsigned long start, unsigned long end) 188within(unsigned long addr, unsigned long start, unsigned long end)
99{ 189{
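When CONFIG_X86_CPA_STATISTICS is enabled, the counters added above are exported through a debugfs file created in arch_debugfs_dir. A minimal reader, assuming debugfs is mounted at /sys/kernel/debug (so the file appears as /sys/kernel/debug/x86/cpa_stats) and that it runs as root:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/x86/cpa_stats";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* The file is plain seq_file text; just echo it. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return EXIT_SUCCESS;
}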
@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
195 on_each_cpu(__cpa_flush_all, (void *) cache, 1); 285 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
196} 286}
197 287
198static void __cpa_flush_range(void *arg) 288static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
199{ 289{
200 /* 290 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
201 * We could optimize that further and do individual per page 291
202 * tlb invalidates for a low number of pages. Caveat: we must 292 WARN_ON(PAGE_ALIGN(start) != start);
203 * flush the high aliases on 64bit as well. 293
204 */ 294 if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
205 __flush_tlb_all(); 295 cpa_flush_all(cache);
296 return true;
297 }
298
299 flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
300
301 return !cache;
206} 302}
207 303
208static void cpa_flush_range(unsigned long start, int numpages, int cache) 304static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
210 unsigned int i, level; 306 unsigned int i, level;
211 unsigned long addr; 307 unsigned long addr;
212 308
213 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); 309 if (__cpa_flush_range(start, numpages, cache))
214 WARN_ON(PAGE_ALIGN(start) != start);
215
216 on_each_cpu(__cpa_flush_range, NULL, 1);
217
218 if (!cache)
219 return; 310 return;
220 311
221 /* 312 /*
@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
235 } 326 }
236} 327}
237 328
238static void cpa_flush_array(unsigned long *start, int numpages, int cache, 329static void cpa_flush_array(unsigned long baddr, unsigned long *start,
330 int numpages, int cache,
239 int in_flags, struct page **pages) 331 int in_flags, struct page **pages)
240{ 332{
241 unsigned int i, level; 333 unsigned int i, level;
242#ifdef CONFIG_PREEMPT
243 /*
244 * Avoid wbinvd() because it causes latencies on all CPUs,
245 * regardless of any CPU isolation that may be in effect.
246 *
247 * This should be extended for CAT enabled systems independent of
248 * PREEMPT because wbinvd() does not respect the CAT partitions and
249 * this is exposed to unpriviledged users through the graphics
250 * subsystem.
251 */
252 unsigned long do_wbinvd = 0;
253#else
254 unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
255#endif
256
257 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
258 334
259 on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); 335 if (__cpa_flush_range(baddr, numpages, cache))
260
261 if (!cache || do_wbinvd)
262 return; 336 return;
263 337
264 /* 338 /*
@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
286 } 360 }
287} 361}
288 362
289/* 363static bool overlaps(unsigned long r1_start, unsigned long r1_end,
290 * Certain areas of memory on x86 require very specific protection flags, 364 unsigned long r2_start, unsigned long r2_end)
291 * for example the BIOS area or kernel text. Callers don't always get this
292 * right (again, ioremap() on BIOS memory is not uncommon) so this function
293 * checks and fixes these known static required protection bits.
294 */
295static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
296 unsigned long pfn)
297{ 365{
298 pgprot_t forbidden = __pgprot(0); 366 return (r1_start <= r2_end && r1_end >= r2_start) ||
367 (r2_start <= r1_end && r2_end >= r1_start);
368}
299 369
300 /*
301 * The BIOS area between 640k and 1Mb needs to be executable for
302 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
303 */
304#ifdef CONFIG_PCI_BIOS 370#ifdef CONFIG_PCI_BIOS
305 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 371/*
306 pgprot_val(forbidden) |= _PAGE_NX; 372 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
373 * based config access (CONFIG_PCI_GOBIOS) support.
374 */
375#define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
376#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
377
378static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
379{
380 if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
381 return _PAGE_NX;
382 return 0;
383}
384#else
385static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
386{
387 return 0;
388}
307#endif 389#endif
308 390
309 /* 391/*
310 * The kernel text needs to be executable for obvious reasons 392 * The .rodata section needs to be read-only. Using the pfn catches all
311 * Does not cover __inittext since that is gone later on. On 393 * aliases. This also includes __ro_after_init, so do not enforce until
312 * 64bit we do not enforce !NX on the low mapping 394 * kernel_set_to_readonly is true.
313 */ 395 */
314 if (within(address, (unsigned long)_text, (unsigned long)_etext)) 396static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
315 pgprot_val(forbidden) |= _PAGE_NX; 397{
398 unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
316 399
317 /* 400 /*
318 * The .rodata section needs to be read-only. Using the pfn 401 * Note: __end_rodata is page aligned and not inclusive, so
319 * catches all aliases. This also includes __ro_after_init, 402 * subtract 1 to get the last enforced PFN in the rodata area.
320 * so do not enforce until kernel_set_to_readonly is true.
321 */ 403 */
322 if (kernel_set_to_readonly && 404 epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
323 within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, 405
324 __pa_symbol(__end_rodata) >> PAGE_SHIFT)) 406 if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
325 pgprot_val(forbidden) |= _PAGE_RW; 407 return _PAGE_RW;
408 return 0;
409}
410
411/*
412 * Protect kernel text against becoming non executable by forbidding
413 * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
414 * out of which the kernel actually executes. Do not protect the low
415 * mapping.
416 *
417 * This does not cover __inittext since that is gone after boot.
418 */
419static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
420{
421 unsigned long t_end = (unsigned long)_etext - 1;
422 unsigned long t_start = (unsigned long)_text;
423
424 if (overlaps(start, end, t_start, t_end))
425 return _PAGE_NX;
426 return 0;
427}
326 428
327#if defined(CONFIG_X86_64) 429#if defined(CONFIG_X86_64)
430/*
431 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
432 * kernel text mappings for the large page aligned text, rodata sections
433 * will be always read-only. For the kernel identity mappings covering the
434 * holes caused by this alignment can be anything that user asks.
435 *
436 * This will preserve the large page mappings for kernel text/data at no
437 * extra cost.
438 */
439static pgprotval_t protect_kernel_text_ro(unsigned long start,
440 unsigned long end)
441{
442 unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
443 unsigned long t_start = (unsigned long)_text;
444 unsigned int level;
445
446 if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
447 return 0;
328 /* 448 /*
329 * Once the kernel maps the text as RO (kernel_set_to_readonly is set), 449 * Don't enforce the !RW mapping for the kernel text mapping, if
330 * kernel text mappings for the large page aligned text, rodata sections 450 * the current mapping is already using small page mapping. No
331 * will be always read-only. For the kernel identity mappings covering 451 * need to work hard to preserve large page mappings in this case.
332 * the holes caused by this alignment can be anything that user asks.
333 * 452 *
334 * This will preserve the large page mappings for kernel text/data 453 * This also fixes the Linux Xen paravirt guest boot failure caused
335 * at no extra cost. 454 * by unexpected read-only mappings for kernel identity
455 * mappings. In this paravirt guest case, the kernel text mapping
456 * and the kernel identity mapping share the same page-table pages,
457 * so the protections for kernel text and identity mappings have to
458 * be the same.
336 */ 459 */
337 if (kernel_set_to_readonly && 460 if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
338 within(address, (unsigned long)_text, 461 return _PAGE_RW;
339 (unsigned long)__end_rodata_hpage_align)) { 462 return 0;
340 unsigned int level; 463}
341 464#else
342 /* 465static pgprotval_t protect_kernel_text_ro(unsigned long start,
343 * Don't enforce the !RW mapping for the kernel text mapping, 466 unsigned long end)
344 * if the current mapping is already using small page mapping. 467{
345 * No need to work hard to preserve large page mappings in this 468 return 0;
346 * case. 469}
347 *
348 * This also fixes the Linux Xen paravirt guest boot failure
349 * (because of unexpected read-only mappings for kernel identity
350 * mappings). In this paravirt guest case, the kernel text
351 * mapping and the kernel identity mapping share the same
352 * page-table pages. Thus we can't really use different
353 * protections for the kernel text and identity mappings. Also,
354 * these shared mappings are made of small page mappings.
355 * Thus this don't enforce !RW mapping for small page kernel
356 * text mapping logic will help Linux Xen parvirt guest boot
357 * as well.
358 */
359 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
360 pgprot_val(forbidden) |= _PAGE_RW;
361 }
362#endif 470#endif
363 471
364 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 472static inline bool conflicts(pgprot_t prot, pgprotval_t val)
473{
474 return (pgprot_val(prot) & ~val) != pgprot_val(prot);
475}
365 476
366 return prot; 477static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
478 unsigned long start, unsigned long end,
479 unsigned long pfn, const char *txt)
480{
481 static const char *lvltxt[] = {
482 [CPA_CONFLICT] = "conflict",
483 [CPA_PROTECT] = "protect",
484 [CPA_DETECT] = "detect",
485 };
486
487 if (warnlvl > cpa_warn_level || !conflicts(prot, val))
488 return;
489
490 pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
491 lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
492 (unsigned long long)val);
493}
494
495/*
496 * Certain areas of memory on x86 require very specific protection flags,
497 * for example the BIOS area or kernel text. Callers don't always get this
498 * right (again, ioremap() on BIOS memory is not uncommon) so this function
499 * checks and fixes these known static required protection bits.
500 */
501static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
502 unsigned long pfn, unsigned long npg,
503 int warnlvl)
504{
505 pgprotval_t forbidden, res;
506 unsigned long end;
507
508 /*
509 * There is no point in checking RW/NX conflicts when the requested
510 * mapping is setting the page !PRESENT.
511 */
512 if (!(pgprot_val(prot) & _PAGE_PRESENT))
513 return prot;
514
515 /* Operate on the virtual address */
516 end = start + npg * PAGE_SIZE - 1;
517
518 res = protect_kernel_text(start, end);
519 check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
520 forbidden = res;
521
522 res = protect_kernel_text_ro(start, end);
523 check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
524 forbidden |= res;
525
526 /* Check the PFN directly */
527 res = protect_pci_bios(pfn, pfn + npg - 1);
528 check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
529 forbidden |= res;
530
531 res = protect_rodata(pfn, pfn + npg - 1);
532 check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
533 forbidden |= res;
534
535 return __pgprot(pgprot_val(prot) & ~forbidden);
367} 536}
368 537
369/* 538/*
@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
421 */ 590 */
422pte_t *lookup_address(unsigned long address, unsigned int *level) 591pte_t *lookup_address(unsigned long address, unsigned int *level)
423{ 592{
424 return lookup_address_in_pgd(pgd_offset_k(address), address, level); 593 return lookup_address_in_pgd(pgd_offset_k(address), address, level);
425} 594}
426EXPORT_SYMBOL_GPL(lookup_address); 595EXPORT_SYMBOL_GPL(lookup_address);
427 596
428static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, 597static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
429 unsigned int *level) 598 unsigned int *level)
430{ 599{
431 if (cpa->pgd) 600 if (cpa->pgd)
432 return lookup_address_in_pgd(cpa->pgd + pgd_index(address), 601 return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
433 address, level); 602 address, level);
434 603
435 return lookup_address(address, level); 604 return lookup_address(address, level);
436} 605}
437 606
438/* 607/*
@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
549 return prot; 718 return prot;
550} 719}
551 720
552static int 721static int __should_split_large_page(pte_t *kpte, unsigned long address,
553try_preserve_large_page(pte_t *kpte, unsigned long address, 722 struct cpa_data *cpa)
554 struct cpa_data *cpa)
555{ 723{
556 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; 724 unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
725 pgprot_t old_prot, new_prot, req_prot, chk_prot;
557 pte_t new_pte, old_pte, *tmp; 726 pte_t new_pte, old_pte, *tmp;
558 pgprot_t old_prot, new_prot, req_prot;
559 int i, do_split = 1;
560 enum pg_level level; 727 enum pg_level level;
561 728
562 if (cpa->force_split)
563 return 1;
564
565 spin_lock(&pgd_lock);
566 /* 729 /*
567 * Check for races, another CPU might have split this page 730 * Check for races, another CPU might have split this page
568 * up already: 731 * up already:
569 */ 732 */
570 tmp = _lookup_address_cpa(cpa, address, &level); 733 tmp = _lookup_address_cpa(cpa, address, &level);
571 if (tmp != kpte) 734 if (tmp != kpte)
572 goto out_unlock; 735 return 1;
573 736
574 switch (level) { 737 switch (level) {
575 case PG_LEVEL_2M: 738 case PG_LEVEL_2M:
576 old_prot = pmd_pgprot(*(pmd_t *)kpte); 739 old_prot = pmd_pgprot(*(pmd_t *)kpte);
577 old_pfn = pmd_pfn(*(pmd_t *)kpte); 740 old_pfn = pmd_pfn(*(pmd_t *)kpte);
741 cpa_inc_2m_checked();
578 break; 742 break;
579 case PG_LEVEL_1G: 743 case PG_LEVEL_1G:
580 old_prot = pud_pgprot(*(pud_t *)kpte); 744 old_prot = pud_pgprot(*(pud_t *)kpte);
581 old_pfn = pud_pfn(*(pud_t *)kpte); 745 old_pfn = pud_pfn(*(pud_t *)kpte);
746 cpa_inc_1g_checked();
582 break; 747 break;
583 default: 748 default:
584 do_split = -EINVAL; 749 return -EINVAL;
585 goto out_unlock;
586 } 750 }
587 751
588 psize = page_level_size(level); 752 psize = page_level_size(level);
@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
592 * Calculate the number of pages, which fit into this large 756 * Calculate the number of pages, which fit into this large
593 * page starting at address: 757 * page starting at address:
594 */ 758 */
595 nextpage_addr = (address + psize) & pmask; 759 lpaddr = (address + psize) & pmask;
596 numpages = (nextpage_addr - address) >> PAGE_SHIFT; 760 numpages = (lpaddr - address) >> PAGE_SHIFT;
597 if (numpages < cpa->numpages) 761 if (numpages < cpa->numpages)
598 cpa->numpages = numpages; 762 cpa->numpages = numpages;
599 763
@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
620 pgprot_val(req_prot) |= _PAGE_PSE; 784 pgprot_val(req_prot) |= _PAGE_PSE;
621 785
622 /* 786 /*
623 * old_pfn points to the large page base pfn. So we need 787 * old_pfn points to the large page base pfn. So we need to add the
624 * to add the offset of the virtual address: 788 * offset of the virtual address:
625 */ 789 */
626 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); 790 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
627 cpa->pfn = pfn; 791 cpa->pfn = pfn;
628 792
629 new_prot = static_protections(req_prot, address, pfn); 793 /*
794 * Calculate the large page base address and the number of 4K pages
795 * in the large page
796 */
797 lpaddr = address & pmask;
798 numpages = psize >> PAGE_SHIFT;
630 799
631 /* 800 /*
632 * We need to check the full range, whether 801 * Sanity check that the existing mapping is correct versus the static
633 * static_protection() requires a different pgprot for one of 802 * protections. static_protections() guards against !PRESENT, so no
634 * the pages in the range we try to preserve: 803 * extra conditional required here.
635 */ 804 */
636 addr = address & pmask; 805 chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
637 pfn = old_pfn; 806 CPA_CONFLICT);
638 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
639 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
640 807
641 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 808 if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
642 goto out_unlock; 809 /*
810 * Split the large page and tell the split code to
811 * enforce static protections.
812 */
813 cpa->force_static_prot = 1;
814 return 1;
643 } 815 }
644 816
645 /* 817 /*
646 * If there are no changes, return. maxpages has been updated 818 * Optimization: If the requested pgprot is the same as the current
647 * above: 819 * pgprot, then the large page can be preserved and no updates are
820 * required independent of alignment and length of the requested
821 * range. The above already established that the current pgprot is
822 * correct, which in consequence makes the requested pgprot correct
823 * as well if it is the same. The static protection scan below will
824 * not come to a different conclusion.
648 */ 825 */
649 if (pgprot_val(new_prot) == pgprot_val(old_prot)) { 826 if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
650 do_split = 0; 827 cpa_inc_lp_sameprot(level);
651 goto out_unlock; 828 return 0;
652 } 829 }
653 830
654 /* 831 /*
655 * We need to change the attributes. Check, whether we can 832 * If the requested range does not cover the full page, split it up
656 * change the large page in one go. We request a split, when
657 * the address is not aligned and the number of pages is
658 * smaller than the number of pages in the large page. Note
659 * that we limited the number of possible pages already to
660 * the number of pages in the large page.
661 */ 833 */
662 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { 834 if (address != lpaddr || cpa->numpages != numpages)
663 /* 835 return 1;
664 * The address is aligned and the number of pages
665 * covers the full page.
666 */
667 new_pte = pfn_pte(old_pfn, new_prot);
668 __set_pmd_pte(kpte, address, new_pte);
669 cpa->flags |= CPA_FLUSHTLB;
670 do_split = 0;
671 }
672 836
673out_unlock: 837 /*
838 * Check whether the requested pgprot is conflicting with a static
839 * protection requirement in the large page.
840 */
841 new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
842 CPA_DETECT);
843
844 /*
845 * If there is a conflict, split the large page.
846 *
847 * There used to be a 4k wise evaluation trying really hard to
848 * preserve the large pages, but experimentation has shown that this
849 * does not help at all. There might be corner cases which would
850 * preserve one large page occasionally, but it's really not worth the
851 * extra code and cycles for the common case.
852 */
853 if (pgprot_val(req_prot) != pgprot_val(new_prot))
854 return 1;
855
856 /* All checks passed. Update the large page mapping. */
857 new_pte = pfn_pte(old_pfn, new_prot);
858 __set_pmd_pte(kpte, address, new_pte);
859 cpa->flags |= CPA_FLUSHTLB;
860 cpa_inc_lp_preserved(level);
861 return 0;
862}
863
864static int should_split_large_page(pte_t *kpte, unsigned long address,
865 struct cpa_data *cpa)
866{
867 int do_split;
868
869 if (cpa->force_split)
870 return 1;
871
872 spin_lock(&pgd_lock);
873 do_split = __should_split_large_page(kpte, address, cpa);
674 spin_unlock(&pgd_lock); 874 spin_unlock(&pgd_lock);
675 875
676 return do_split; 876 return do_split;
677} 877}
678 878
879static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
880 pgprot_t ref_prot, unsigned long address,
881 unsigned long size)
882{
883 unsigned int npg = PFN_DOWN(size);
884 pgprot_t prot;
885
886 /*
887 * If should_split_large_page() discovered an inconsistent mapping,
888 * remove the invalid protection in the split mapping.
889 */
890 if (!cpa->force_static_prot)
891 goto set;
892
893 prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
894
895 if (pgprot_val(prot) == pgprot_val(ref_prot))
896 goto set;
897
898 /*
899 * If this is splitting a PMD, fix it up. PUD splits cannot be
900 * fixed trivially as that would require rescanning the newly
901 * installed PMD mappings after returning from split_large_page()
902 * so an eventual further split can allocate the necessary PTE
903 * pages. Warn for now and revisit it in case this actually
904 * happens.
905 */
906 if (size == PAGE_SIZE)
907 ref_prot = prot;
908 else
909 pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
910set:
911 set_pte(pte, pfn_pte(pfn, ref_prot));
912}
913
679static int 914static int
680__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, 915__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
681 struct page *base) 916 struct page *base)
682{ 917{
918 unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
683 pte_t *pbase = (pte_t *)page_address(base); 919 pte_t *pbase = (pte_t *)page_address(base);
684 unsigned long ref_pfn, pfn, pfninc = 1;
685 unsigned int i, level; 920 unsigned int i, level;
686 pte_t *tmp;
687 pgprot_t ref_prot; 921 pgprot_t ref_prot;
922 pte_t *tmp;
688 923
689 spin_lock(&pgd_lock); 924 spin_lock(&pgd_lock);
690 /* 925 /*
@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
707 * PAT bit to correct position. 942 * PAT bit to correct position.
708 */ 943 */
709 ref_prot = pgprot_large_2_4k(ref_prot); 944 ref_prot = pgprot_large_2_4k(ref_prot);
710
711 ref_pfn = pmd_pfn(*(pmd_t *)kpte); 945 ref_pfn = pmd_pfn(*(pmd_t *)kpte);
946 lpaddr = address & PMD_MASK;
947 lpinc = PAGE_SIZE;
712 break; 948 break;
713 949
714 case PG_LEVEL_1G: 950 case PG_LEVEL_1G:
715 ref_prot = pud_pgprot(*(pud_t *)kpte); 951 ref_prot = pud_pgprot(*(pud_t *)kpte);
716 ref_pfn = pud_pfn(*(pud_t *)kpte); 952 ref_pfn = pud_pfn(*(pud_t *)kpte);
717 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; 953 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
718 954 lpaddr = address & PUD_MASK;
955 lpinc = PMD_SIZE;
719 /* 956 /*
720 * Clear the PSE flags if the PRESENT flag is not set 957 * Clear the PSE flags if the PRESENT flag is not set
721 * otherwise pmd_present/pmd_huge will return true 958 * otherwise pmd_present/pmd_huge will return true
@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
736 * Get the target pfn from the original entry: 973 * Get the target pfn from the original entry:
737 */ 974 */
738 pfn = ref_pfn; 975 pfn = ref_pfn;
739 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 976 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
740 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 977 split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
741 978
742 if (virt_addr_valid(address)) { 979 if (virt_addr_valid(address)) {
743 unsigned long pfn = PFN_DOWN(__pa(address)); 980 unsigned long pfn = PFN_DOWN(__pa(address));
@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
756 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 993 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
757 994
758 /* 995 /*
759 * Intel Atom errata AAH41 workaround. 996 * Do a global flush tlb after splitting the large page
997 * and before we do the actual change page attribute in the PTE.
998 *
999 * Without this, we violate the TLB application note, that says:
1000 * "The TLBs may contain both ordinary and large-page
1001 * translations for a 4-KByte range of linear addresses. This
1002 * may occur if software modifies the paging structures so that
1003 * the page size used for the address range changes. If the two
1004 * translations differ with respect to page frame or attributes
1005 * (e.g., permissions), processor behavior is undefined and may
1006 * be implementation-specific."
760 * 1007 *
761 * The real fix should be in hw or in a microcode update, but 1008 * We do this global tlb flush inside the cpa_lock, so that we
762 * we also probabilistically try to reduce the window of having 1009 * don't allow any other cpu, with stale tlb entries change the
763 * a large TLB mixed with 4K TLBs while instruction fetches are 1010 * page attribute in parallel, that also falls into the
764 * going on. 1011 * just split large page entry.
765 */ 1012 */
766 __flush_tlb_all(); 1013 flush_tlb_all();
767 spin_unlock(&pgd_lock); 1014 spin_unlock(&pgd_lock);
768 1015
769 return 0; 1016 return 0;
@@ -1247,7 +1494,9 @@ repeat:
1247 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 1494 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1248 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 1495 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1249 1496
1250 new_prot = static_protections(new_prot, address, pfn); 1497 cpa_inc_4k_install();
1498 new_prot = static_protections(new_prot, address, pfn, 1,
1499 CPA_PROTECT);
1251 1500
1252 new_prot = pgprot_clear_protnone_bits(new_prot); 1501 new_prot = pgprot_clear_protnone_bits(new_prot);
1253 1502
@@ -1273,7 +1522,7 @@ repeat:
1273 * Check, whether we can keep the large page intact 1522 * Check, whether we can keep the large page intact
1274 * and just change the pte: 1523 * and just change the pte:
1275 */ 1524 */
1276 do_split = try_preserve_large_page(kpte, address, cpa); 1525 do_split = should_split_large_page(kpte, address, cpa);
1277 /* 1526 /*
1278 * When the range fits into the existing large page, 1527 * When the range fits into the existing large page,
1279 * return. cp->numpages and cpa->tlbflush have been updated in 1528 * return. cp->numpages and cpa->tlbflush have been updated in
@@ -1286,28 +1535,8 @@ repeat:
1286 * We have to split the large page: 1535 * We have to split the large page:
1287 */ 1536 */
1288 err = split_large_page(cpa, kpte, address); 1537 err = split_large_page(cpa, kpte, address);
1289 if (!err) { 1538 if (!err)
1290 /*
1291 * Do a global flush tlb after splitting the large page
1292 * and before we do the actual change page attribute in the PTE.
1293 *
1294 * With out this, we violate the TLB application note, that says
1295 * "The TLBs may contain both ordinary and large-page
1296 * translations for a 4-KByte range of linear addresses. This
1297 * may occur if software modifies the paging structures so that
1298 * the page size used for the address range changes. If the two
1299 * translations differ with respect to page frame or attributes
1300 * (e.g., permissions), processor behavior is undefined and may
1301 * be implementation-specific."
1302 *
1303 * We do this global tlb flush inside the cpa_lock, so that we
1304 * don't allow any other cpu, with stale tlb entries change the
1305 * page attribute in parallel, that also falls into the
1306 * just split large page entry.
1307 */
1308 flush_tlb_all();
1309 goto repeat; 1539 goto repeat;
1310 }
1311 1540
1312 return err; 1541 return err;
1313} 1542}
@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1529 cache = !!pgprot2cachemode(mask_set); 1758 cache = !!pgprot2cachemode(mask_set);
1530 1759
1531 /* 1760 /*
1532 * On success we use CLFLUSH, when the CPU supports it to 1761 * On error; flush everything to be sure.
1533 * avoid the WBINVD. If the CPU does not support it and in the
1534 * error case we fall back to cpa_flush_all (which uses
1535 * WBINVD):
1536 */ 1762 */
1537 if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { 1763 if (ret) {
1538 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1539 cpa_flush_array(addr, numpages, cache,
1540 cpa.flags, pages);
1541 } else
1542 cpa_flush_range(baddr, numpages, cache);
1543 } else
1544 cpa_flush_all(cache); 1764 cpa_flush_all(cache);
1765 goto out;
1766 }
1767
1768 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1769 cpa_flush_array(baddr, addr, numpages, cache,
1770 cpa.flags, pages);
1771 } else {
1772 cpa_flush_range(baddr, numpages, cache);
1773 }
1545 1774
1546out: 1775out:
1547 return ret; 1776 return ret;
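The rewritten flush path above funnels both cpa_flush_range() and cpa_flush_array() through __cpa_flush_range(), whose return value tells the caller whether any per-page cache maintenance is still needed. The toy below mirrors that contract with made-up names (flush_range(), have_clflush); it is a sketch of the decision logic, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the new flush decision: a full wbinvd-style flush is the
 * fallback when caching attributes changed but CLFLUSH is unavailable;
 * otherwise only the TLB range is flushed and, when 'cache' is set, the
 * caller still walks the range doing per-line cache flushes. */
static bool flush_range(unsigned long start, int numpages, int cache,
			int have_clflush)
{
	if (cache && !have_clflush) {
		printf("full cache+TLB flush (wbinvd path)\n");
		return true;            /* nothing left for the caller */
	}
	printf("flush TLB for %d pages at %#lx\n", numpages, start);
	return !cache;                  /* caller flushes cache lines if set */
}

int main(void)
{
	if (!flush_range(0xffff880000000000UL, 16, 1, 1))
		printf("caller: clflush each changed page\n");
	flush_range(0xffff880000000000UL, 16, 1, 0);
	return 0;
}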
@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
1856 /* 2085 /*
1857 * Before changing the encryption attribute, we need to flush caches. 2086 * Before changing the encryption attribute, we need to flush caches.
1858 */ 2087 */
1859 if (static_cpu_has(X86_FEATURE_CLFLUSH)) 2088 cpa_flush_range(start, numpages, 1);
1860 cpa_flush_range(start, numpages, 1);
1861 else
1862 cpa_flush_all(1);
1863 2089
1864 ret = __change_page_attr_set_clr(&cpa, 1); 2090 ret = __change_page_attr_set_clr(&cpa, 1);
1865 2091
@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
1870 * in case TLB flushing gets optimized in the cpa_flush_range() 2096 * in case TLB flushing gets optimized in the cpa_flush_range()
1871 * path use the same logic as above. 2097 * path use the same logic as above.
1872 */ 2098 */
1873 if (static_cpu_has(X86_FEATURE_CLFLUSH)) 2099 cpa_flush_range(start, numpages, 0);
1874 cpa_flush_range(start, numpages, 0);
1875 else
1876 cpa_flush_all(0);
1877 2100
1878 return ret; 2101 return ret;
1879} 2102}
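The static_protections() rework above replaces per-address checks with range checks built on the new overlaps() helper, where both range ends are inclusive, and accumulates the individual results into one forbidden mask that is stripped from the requested pgprot. A small user-space check of both ideas, with hypothetical PAGE_RW/PAGE_NX values and made-up PFN ranges:

#include <assert.h>
#include <stdio.h>

/* Same inclusive-range test the patch introduces: both ends belong to
 * the range, which is why callers pass e.g. PFN_DOWN(BIOS_END - 1). */
static int overlaps(unsigned long r1_start, unsigned long r1_end,
		    unsigned long r2_start, unsigned long r2_end)
{
	return (r1_start <= r2_end && r1_end >= r2_start) ||
	       (r2_start <= r1_end && r2_end >= r1_start);
}

/* Hypothetical bit values, only to show how static_protections() now
 * ORs the per-check results into one 'forbidden' mask and strips it. */
#define PAGE_RW (1ULL << 1)
#define PAGE_NX (1ULL << 63)

int main(void)
{
	unsigned long long req = PAGE_RW | PAGE_NX;   /* requested pgprot */
	unsigned long long forbidden = 0;

	assert(overlaps(10, 20, 20, 30));             /* touching ends overlap */
	assert(!overlaps(10, 19, 20, 30));

	if (overlaps(0xa0, 0xff, 0xa0, 0xbf))         /* pretend PCI BIOS pfns */
		forbidden |= PAGE_NX;
	if (overlaps(0x100, 0x1ff, 0x180, 0x1bf))     /* pretend rodata pfns */
		forbidden |= PAGE_RW;

	printf("req %#llx forbidden %#llx -> %#llx\n",
	       req, forbidden, req & ~forbidden);
	return 0;
}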
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e96b99eb800c..7d68489cfdb1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
185{ 185{
186 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); 186 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
187 u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 187 u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
188 bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
188 unsigned cpu = smp_processor_id(); 189 unsigned cpu = smp_processor_id();
189 u64 next_tlb_gen; 190 u64 next_tlb_gen;
191 bool need_flush;
192 u16 new_asid;
190 193
191 /* 194 /*
192 * NB: The scheduler will call us with prev == next when switching 195 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
240 next->context.ctx_id); 243 next->context.ctx_id);
241 244
242 /* 245 /*
243 * We don't currently support having a real mm loaded without 246 * Even in lazy TLB mode, the CPU should stay set in the
244 * our cpu set in mm_cpumask(). We have all the bookkeeping 247 * mm_cpumask. The TLB shootdown code can figure out from
245 * in place to figure out whether we would need to flush 248 * cpu_tlbstate.is_lazy whether or not to send an IPI.
246 * if our cpu were cleared in mm_cpumask(), but we don't
247 * currently use it.
248 */ 249 */
249 if (WARN_ON_ONCE(real_prev != &init_mm && 250 if (WARN_ON_ONCE(real_prev != &init_mm &&
250 !cpumask_test_cpu(cpu, mm_cpumask(next)))) 251 !cpumask_test_cpu(cpu, mm_cpumask(next))))
251 cpumask_set_cpu(cpu, mm_cpumask(next)); 252 cpumask_set_cpu(cpu, mm_cpumask(next));
252 253
253 return; 254 /*
255 * If the CPU is not in lazy TLB mode, we are just switching
256 * from one thread in a process to another thread in the same
257 * process. No TLB flush required.
258 */
259 if (!was_lazy)
260 return;
261
262 /*
263 * Read the tlb_gen to check whether a flush is needed.
264 * If the TLB is up to date, just use it.
265 * The barrier synchronizes with the tlb_gen increment in
266 * the TLB shootdown code.
267 */
268 smp_mb();
269 next_tlb_gen = atomic64_read(&next->context.tlb_gen);
270 if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
271 next_tlb_gen)
272 return;
273
274 /*
275 * TLB contents went out of date while we were in lazy
276 * mode. Fall through to the TLB switching code below.
277 */
278 new_asid = prev_asid;
279 need_flush = true;
254 } else { 280 } else {
255 u16 new_asid;
256 bool need_flush;
257 u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); 281 u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
258 282
259 /* 283 /*
@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
308 /* Let nmi_uaccess_okay() know that we're changing CR3. */ 332 /* Let nmi_uaccess_okay() know that we're changing CR3. */
309 this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); 333 this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
310 barrier(); 334 barrier();
335 }
311 336
312 if (need_flush) { 337 if (need_flush) {
313 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 338 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
314 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 339 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
315 load_new_mm_cr3(next->pgd, new_asid, true); 340 load_new_mm_cr3(next->pgd, new_asid, true);
316
317 /*
318 * NB: This gets called via leave_mm() in the idle path
319 * where RCU functions differently. Tracing normally
320 * uses RCU, so we need to use the _rcuidle variant.
321 *
322 * (There is no good reason for this. The idle code should
323 * be rearranged to call this before rcu_idle_enter().)
324 */
325 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
326 } else {
327 /* The new ASID is already up to date. */
328 load_new_mm_cr3(next->pgd, new_asid, false);
329
330 /* See above wrt _rcuidle. */
331 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
332 }
333 341
334 /* 342 /*
335 * Record last user mm's context id, so we can avoid 343 * NB: This gets called via leave_mm() in the idle path
336 * flushing branch buffer with IBPB if we switch back 344 * where RCU functions differently. Tracing normally
337 * to the same user. 345 * uses RCU, so we need to use the _rcuidle variant.
346 *
347 * (There is no good reason for this. The idle code should
348 * be rearranged to call this before rcu_idle_enter().)
338 */ 349 */
339 if (next != &init_mm) 350 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
340 this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); 351 } else {
341 352 /* The new ASID is already up to date. */
342 /* Make sure we write CR3 before loaded_mm. */ 353 load_new_mm_cr3(next->pgd, new_asid, false);
343 barrier();
344 354
345 this_cpu_write(cpu_tlbstate.loaded_mm, next); 355 /* See above wrt _rcuidle. */
346 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); 356 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
347 } 357 }
348 358
349 load_mm_cr4(next); 359 /*
350 switch_ldt(real_prev, next); 360 * Record last user mm's context id, so we can avoid
361 * flushing branch buffer with IBPB if we switch back
362 * to the same user.
363 */
364 if (next != &init_mm)
365 this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
366
367 /* Make sure we write CR3 before loaded_mm. */
368 barrier();
369
370 this_cpu_write(cpu_tlbstate.loaded_mm, next);
371 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
372
373 if (next != real_prev) {
374 load_mm_cr4(next);
375 switch_ldt(real_prev, next);
376 }
351} 377}
352 378
353/* 379/*
@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
368 if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 394 if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
369 return; 395 return;
370 396
371 if (tlb_defer_switch_to_init_mm()) { 397 this_cpu_write(cpu_tlbstate.is_lazy, true);
372 /*
373 * There's a significant optimization that may be possible
374 * here. We have accurate enough TLB flush tracking that we
375 * don't need to maintain coherence of TLB per se when we're
376 * lazy. We do, however, need to maintain coherence of
377 * paging-structure caches. We could, in principle, leave our
378 * old mm loaded and only switch to init_mm when
379 * tlb_remove_page() happens.
380 */
381 this_cpu_write(cpu_tlbstate.is_lazy, true);
382 } else {
383 switch_mm(NULL, &init_mm, NULL);
384 }
385} 398}
386 399
387/* 400/*
@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
468 * paging-structure cache to avoid speculatively reading 481 * paging-structure cache to avoid speculatively reading
469 * garbage into our TLB. Since switching to init_mm is barely 482 * garbage into our TLB. Since switching to init_mm is barely
470 * slower than a minimal flush, just switch to init_mm. 483 * slower than a minimal flush, just switch to init_mm.
484 *
485 * This should be rare, with native_flush_tlb_others skipping
486 * IPIs to lazy TLB mode CPUs.
471 */ 487 */
472 switch_mm_irqs_off(NULL, &init_mm, NULL); 488 switch_mm_irqs_off(NULL, &init_mm, NULL);
473 return; 489 return;
@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
528 f->new_tlb_gen == local_tlb_gen + 1 && 544 f->new_tlb_gen == local_tlb_gen + 1 &&
529 f->new_tlb_gen == mm_tlb_gen) { 545 f->new_tlb_gen == mm_tlb_gen) {
530 /* Partial flush */ 546 /* Partial flush */
531 unsigned long addr; 547 unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
532 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; 548 unsigned long addr = f->start;
533 549
534 addr = f->start;
535 while (addr < f->end) { 550 while (addr < f->end) {
536 __flush_tlb_one_user(addr); 551 __flush_tlb_one_user(addr);
537 addr += PAGE_SIZE; 552 addr += 1UL << f->stride_shift;
538 } 553 }
539 if (local) 554 if (local)
540 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); 555 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
541 trace_tlb_flush(reason, nr_pages); 556 trace_tlb_flush(reason, nr_invalidate);
542 } else { 557 } else {
543 /* Full flush. */ 558 /* Full flush. */
544 local_flush_tlb(); 559 local_flush_tlb();
@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
571 flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); 586 flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
572} 587}
573 588
589static bool tlb_is_not_lazy(int cpu, void *data)
590{
591 return !per_cpu(cpu_tlbstate.is_lazy, cpu);
592}
593
574void native_flush_tlb_others(const struct cpumask *cpumask, 594void native_flush_tlb_others(const struct cpumask *cpumask,
575 const struct flush_tlb_info *info) 595 const struct flush_tlb_info *info)
576{ 596{
@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
606 (void *)info, 1); 626 (void *)info, 1);
607 return; 627 return;
608 } 628 }
609 smp_call_function_many(cpumask, flush_tlb_func_remote, 629
630 /*
631 * If no page tables were freed, we can skip sending IPIs to
632 * CPUs in lazy TLB mode. They will flush their TLBs themselves
633 * at the next context switch.
634 *
635 * However, if page tables are getting freed, we need to send the
636 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
637 * up on the new contents of what used to be page tables, while
638 * doing a speculative memory access.
639 */
640 if (info->freed_tables)
641 smp_call_function_many(cpumask, flush_tlb_func_remote,
610 (void *)info, 1); 642 (void *)info, 1);
643 else
644 on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
645 (void *)info, 1, GFP_ATOMIC, cpumask);
611} 646}
612 647
613/* 648/*
@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
623static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; 658static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
624 659
625void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 660void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
626 unsigned long end, unsigned long vmflag) 661 unsigned long end, unsigned int stride_shift,
662 bool freed_tables)
627{ 663{
628 int cpu; 664 int cpu;
629 665
630 struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { 666 struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
631 .mm = mm, 667 .mm = mm,
668 .stride_shift = stride_shift,
669 .freed_tables = freed_tables,
632 }; 670 };
633 671
634 cpu = get_cpu(); 672 cpu = get_cpu();
@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
638 676
639 /* Should we flush just the requested range? */ 677 /* Should we flush just the requested range? */
640 if ((end != TLB_FLUSH_ALL) && 678 if ((end != TLB_FLUSH_ALL) &&
641 !(vmflag & VM_HUGETLB) && 679 ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
642 ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
643 info.start = start; 680 info.start = start;
644 info.end = end; 681 info.end = end;
645 } else { 682 } else {
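flush_tlb_mm_range() now takes a stride shift and counts invalidations in strides rather than 4k pages, so flushing a huge-page range no longer falls back to a full flush just because it spans many base pages. A sketch of that arithmetic, assuming the default ceiling of 33 from tlb_single_page_flush_ceiling; the addresses and the flush() helper are illustrative only.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21

/* Mirrors the new partial-flush bookkeeping: the number of invalidations
 * is counted in strides, then compared against the ceiling. */
static void flush(unsigned long start, unsigned long end,
		  unsigned int stride_shift, unsigned long ceiling)
{
	unsigned long nr_invalidate = (end - start) >> stride_shift;

	if (nr_invalidate > ceiling) {
		printf("full TLB flush (%lu entries > ceiling %lu)\n",
		       nr_invalidate, ceiling);
		return;
	}
	for (unsigned long addr = start; addr < end; addr += 1UL << stride_shift)
		printf("invlpg %#lx\n", addr);
}

int main(void)
{
	unsigned long ceiling = 33;	/* tlb_single_page_flush_ceiling */

	flush(0x7f0000000000UL, 0x7f0000000000UL + (8UL << PMD_SHIFT),
	      PMD_SHIFT, ceiling);	/* 8 huge pages: 8 invalidations */
	flush(0x7f0000000000UL, 0x7f0000000000UL + (8UL << PMD_SHIFT),
	      PAGE_SHIFT, ceiling);	/* same bytes in 4k strides: full flush */
	return 0;
}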
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index e3b18ad49889..145506f9fdbe 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -22,6 +22,7 @@
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/nmi.h> 23#include <linux/nmi.h>
24#include <linux/cpuhotplug.h> 24#include <linux/cpuhotplug.h>
25#include <linux/stackprotector.h>
25 26
26#include <asm/paravirt.h> 27#include <asm/paravirt.h>
27#include <asm/desc.h> 28#include <asm/desc.h>
@@ -88,6 +89,7 @@ static void cpu_bringup(void)
88asmlinkage __visible void cpu_bringup_and_idle(void) 89asmlinkage __visible void cpu_bringup_and_idle(void)
89{ 90{
90 cpu_bringup(); 91 cpu_bringup();
92 boot_init_stack_canary();
91 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); 93 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
92} 94}
93 95
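On Xen PV the secondary-CPU entry point now calls boot_init_stack_canary() itself, since cpu_startup_entry() no longer does, and the call must happen before any stack-protected code runs on that CPU. The user-space analogue below only illustrates that ordering requirement; the guard variable, init_canary() and the manual epilogue check stand in for what the compiler's -fstack-protector instrumentation and the real per-CPU canary do.

#include <stdio.h>
#include <stdlib.h>

static unsigned long guard;

static void init_canary(void)
{
	/* Stand-in for the rdtsc/rdrand seeding done at boot. */
	srand(1234);
	guard = ((unsigned long)rand() << 16) ^ (unsigned long)rand();
}

static void protected_function(const char *arg)
{
	unsigned long canary = guard;	/* prologue: copy guard to frame */
	char buf[16];

	snprintf(buf, sizeof(buf), "%s", arg);
	printf("%s\n", buf);

	if (canary != guard) {		/* epilogue: detect corruption */
		fprintf(stderr, "stack smashing detected\n");
		abort();
	}
}

int main(void)
{
	init_canary();			/* must run before protected code */
	protected_function("hello");
	return 0;
}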
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 84b3e4445d46..3931c7de7c69 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -902,12 +902,22 @@ static bool copy_device_table(void)
902 } 902 }
903 } 903 }
904 904
905 old_devtb_phys = entry & PAGE_MASK; 905 /*
906 * When SME is enabled in the first kernel, the entry includes the
907 * memory encryption mask (sme_me_mask), so we must remove the
908 * mask to obtain the true physical address in the kdump kernel.
909 */
910 old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
911
906 if (old_devtb_phys >= 0x100000000ULL) { 912 if (old_devtb_phys >= 0x100000000ULL) {
907 pr_err("The address of old device table is above 4G, not trustworthy!\n"); 913 pr_err("The address of old device table is above 4G, not trustworthy!\n");
908 return false; 914 return false;
909 } 915 }
910 old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); 916 old_devtb = (sme_active() && is_kdump_kernel())
917 ? (__force void *)ioremap_encrypted(old_devtb_phys,
918 dev_table_size)
919 : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
920
911 if (!old_devtb) 921 if (!old_devtb)
912 return false; 922 return false;
913 923
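With SME active, a device-table pointer written by the first kernel still carries the encryption mask, so masking with PAGE_MASK alone can leave the C-bit set and make the address look like it lies above 4G. The sketch below shows the effect of clearing a hypothetical mask bit first; the real sme_me_mask position is discovered from CPUID at boot, and the bit-47 value and sme_clr() helper here are illustrative stand-ins.

#include <stdio.h>
#include <stdint.h>

#define PAGE_MASK	(~0xfffULL)
#define SME_ME_MASK	(1ULL << 47)	/* illustrative C-bit position */

static uint64_t sme_clr(uint64_t val)
{
	return val & ~SME_ME_MASK;
}

int main(void)
{
	/* A device-table entry as the old kernel wrote it with SME on:
	 * physical address plus the encryption bit plus low flag bits. */
	uint64_t entry = 0x00000000fd200000ULL | SME_ME_MASK | 0x3;

	printf("entry                        %#llx\n",
	       (unsigned long long)entry);
	printf("entry & PAGE_MASK            %#llx (C-bit still set)\n",
	       (unsigned long long)(entry & PAGE_MASK));
	printf("sme_clr(entry) & PAGE_MASK   %#llx\n",
	       (unsigned long long)(sme_clr(entry) & PAGE_MASK));
	return 0;
}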
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728f8ac6..91ae16fbd7d5 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -24,6 +24,8 @@
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/mem_encrypt.h>
28#include <asm/pgtable.h>
27#include <asm/io.h> 29#include <asm/io.h>
28#include "internal.h" 30#include "internal.h"
29 31
@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
98 100
99/* Reads a page from the oldmem device from given offset. */ 101/* Reads a page from the oldmem device from given offset. */
100static ssize_t read_from_oldmem(char *buf, size_t count, 102static ssize_t read_from_oldmem(char *buf, size_t count,
101 u64 *ppos, int userbuf) 103 u64 *ppos, int userbuf,
104 bool encrypted)
102{ 105{
103 unsigned long pfn, offset; 106 unsigned long pfn, offset;
104 size_t nr_bytes; 107 size_t nr_bytes;
@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
120 if (pfn_is_ram(pfn) == 0) 123 if (pfn_is_ram(pfn) == 0)
121 memset(buf, 0, nr_bytes); 124 memset(buf, 0, nr_bytes);
122 else { 125 else {
123 tmp = copy_oldmem_page(pfn, buf, nr_bytes, 126 if (encrypted)
124 offset, userbuf); 127 tmp = copy_oldmem_page_encrypted(pfn, buf,
128 nr_bytes,
129 offset,
130 userbuf);
131 else
132 tmp = copy_oldmem_page(pfn, buf, nr_bytes,
133 offset, userbuf);
134
125 if (tmp < 0) 135 if (tmp < 0)
126 return tmp; 136 return tmp;
127 } 137 }
@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
155 */ 165 */
156ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) 166ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
157{ 167{
158 return read_from_oldmem(buf, count, ppos, 0); 168 return read_from_oldmem(buf, count, ppos, 0, false);
159} 169}
160 170
161/* 171/*
@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
163 */ 173 */
164ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) 174ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
165{ 175{
166 return read_from_oldmem(buf, count, ppos, 0); 176 return read_from_oldmem(buf, count, ppos, 0, sme_active());
167} 177}
168 178
169/* 179/*
@@ -173,10 +183,21 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
173 unsigned long from, unsigned long pfn, 183 unsigned long from, unsigned long pfn,
174 unsigned long size, pgprot_t prot) 184 unsigned long size, pgprot_t prot)
175{ 185{
186 prot = pgprot_encrypted(prot);
176 return remap_pfn_range(vma, from, pfn, size, prot); 187 return remap_pfn_range(vma, from, pfn, size, prot);
177} 188}
178 189
179/* 190/*
191 * Architectures which support memory encryption override this.
192 */
193ssize_t __weak
194copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
195 unsigned long offset, int userbuf)
196{
197 return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
198}
199
200/*
180 * Copy to either kernel or user space 201 * Copy to either kernel or user space
181 */ 202 */
182static int copy_to(void *target, void *src, size_t size, int userbuf) 203static int copy_to(void *target, void *src, size_t size, int userbuf)
@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
351 m->offset + m->size - *fpos, 372 m->offset + m->size - *fpos,
352 buflen); 373 buflen);
353 start = m->paddr + *fpos - m->offset; 374 start = m->paddr + *fpos - m->offset;
354 tmp = read_from_oldmem(buffer, tsz, &start, userbuf); 375 tmp = read_from_oldmem(buffer, tsz, &start,
376 userbuf, sme_active());
355 if (tmp < 0) 377 if (tmp < 0)
356 return tmp; 378 return tmp;
357 buflen -= tsz; 379 buflen -= tsz;
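read_from_oldmem() walks the requested window one page frame at a time and now forwards each chunk to either the plain or the encrypted copy helper, depending on the flag its callers pass (sme_active() for the dump payload, false for the ELF headers). A user-space sketch of that per-page split, with the copy calls reduced to printf() and the read_oldmem() helper name invented for the example:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static void read_oldmem(unsigned long long pos, unsigned long count,
			int encrypted)
{
	while (count) {
		unsigned long pfn = pos >> PAGE_SHIFT;
		unsigned long offset = pos & (PAGE_SIZE - 1);
		unsigned long nr = PAGE_SIZE - offset;	/* stay in this page */

		if (nr > count)
			nr = count;
		printf("copy_oldmem_page%s(pfn=%lu, offset=%lu, len=%lu)\n",
		       encrypted ? "_encrypted" : "", pfn, offset, nr);
		pos += nr;
		count -= nr;
	}
}

int main(void)
{
	read_oldmem(0x1ff8, 0x20, 1);	/* crosses one page boundary */
	return 0;
}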
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 3e4ba9d753c8..f774c5eb9e3c 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
26 26
27extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, 27extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
28 unsigned long, int); 28 unsigned long, int);
29extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
30 size_t csize, unsigned long offset,
31 int userbuf);
32
29void vmcore_cleanup(void); 33void vmcore_cleanup(void);
30 34
31/* Architecture code defines this if there are other possible ELF 35/* Architecture code defines this if there are other possible ELF
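The new copy_oldmem_page_encrypted() declaration is backed by a __weak default in fs/proc/vmcore.c that simply forwards to copy_oldmem_page(), so only architectures with memory encryption need to override it. A stand-alone illustration of that weak-override pattern follows; copy_plain() and copy_encrypted() are made-up stand-ins, and linking in a second file that defines a non-weak copy_encrypted() would replace the default at link time.

#include <stdio.h>

long copy_plain(unsigned long pfn)
{
	printf("plain copy of pfn %lu\n", pfn);
	return 0;
}

/* Default implementation; an architecture-specific strong definition
 * elsewhere would silently take precedence. */
__attribute__((weak)) long copy_encrypted(unsigned long pfn)
{
	return copy_plain(pfn);		/* no encryption support */
}

int main(void)
{
	copy_encrypted(42);
	return 0;
}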
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9fb239e12b82..a56f08ff3097 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
53 smp_call_func_t func, void *info, bool wait, 53 smp_call_func_t func, void *info, bool wait,
54 gfp_t gfp_flags); 54 gfp_t gfp_flags);
55 55
56void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
57 smp_call_func_t func, void *info, bool wait,
58 gfp_t gfp_flags, const struct cpumask *mask);
59
56int smp_call_function_single_async(int cpu, call_single_data_t *csd); 60int smp_call_function_single_async(int cpu, call_single_data_t *csd);
57 61
58#ifdef CONFIG_SMP 62#ifdef CONFIG_SMP
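on_each_cpu_cond_mask() behaves like on_each_cpu_cond() but restricts the predicate scan to a caller-supplied cpumask; the TLB code uses it with a "CPU is not lazy" test to avoid IPIs to lazy CPUs when no page tables were freed. A simplified user-space model follows, with a plain bitmask instead of struct cpumask and the wait/gfp arguments dropped:

#include <stdio.h>

#define NR_CPUS 8

typedef int (*cond_fn)(int cpu, void *info);

static void on_each_cpu_cond_mask(cond_fn cond, void *info, unsigned int mask)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if ((mask & (1u << cpu)) && cond(cpu, info))
			printf("IPI -> CPU %d\n", cpu);	/* func(info) runs there */
}

static int tlb_is_not_lazy(int cpu, void *info)
{
	unsigned int lazy_mask = *(unsigned int *)info;

	return !(lazy_mask & (1u << cpu));
}

int main(void)
{
	unsigned int lazy = 0x0c;	/* CPUs 2 and 3 are in lazy TLB mode */

	on_each_cpu_cond_mask(tlb_is_not_lazy, &lazy, 0x0f);
	return 0;
}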
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..86ef06d3dbe3 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
471 } 471 }
472 } 472 }
473 473
474 /* Ensure that these pages are decrypted if SME is enabled. */
475 if (pages)
476 arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
477
474 return pages; 478 return pages;
475} 479}
476 480
@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
867 result = -ENOMEM; 871 result = -ENOMEM;
868 goto out; 872 goto out;
869 } 873 }
874 arch_kexec_post_alloc_pages(page_address(page), 1, 0);
870 ptr = kmap(page); 875 ptr = kmap(page);
871 ptr += maddr & ~PAGE_MASK; 876 ptr += maddr & ~PAGE_MASK;
872 mchunk = min_t(size_t, mbytes, 877 mchunk = min_t(size_t, mbytes,
@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
884 result = copy_from_user(ptr, buf, uchunk); 889 result = copy_from_user(ptr, buf, uchunk);
885 kexec_flush_icache_page(page); 890 kexec_flush_icache_page(page);
886 kunmap(page); 891 kunmap(page);
892 arch_kexec_pre_free_pages(page_address(page), 1);
887 if (result) { 893 if (result) {
888 result = -EFAULT; 894 result = -EFAULT;
889 goto out; 895 goto out;
diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..b3a3a1fc499e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -318,33 +318,34 @@ int release_resource(struct resource *old)
318 318
319EXPORT_SYMBOL(release_resource); 319EXPORT_SYMBOL(release_resource);
320 320
321/* 321/**
322 * Finds the lowest iomem resource existing within [res->start.res->end). 322 * Finds the lowest iomem resource that covers part of [start..end]. The
323 * The caller must specify res->start, res->end, res->flags, and optionally 323 * caller must specify start, end, flags, and desc (which may be
324 * desc. If found, returns 0, res is overwritten, if not found, returns -1. 324 * IORES_DESC_NONE).
325 * This function walks the whole tree and not just first level children until 325 *
326 * and unless first_level_children_only is true. 326 * If a resource is found, returns 0 and *res is overwritten with the part
327 * of the resource that's within [start..end]; if none is found, returns
328 * -1.
329 *
330 * This function walks the whole tree and not just first level children
331 * unless @first_lvl is true.
327 */ 332 */
328static int find_next_iomem_res(struct resource *res, unsigned long desc, 333static int find_next_iomem_res(resource_size_t start, resource_size_t end,
329 bool first_level_children_only) 334 unsigned long flags, unsigned long desc,
335 bool first_lvl, struct resource *res)
330{ 336{
331 resource_size_t start, end;
332 struct resource *p; 337 struct resource *p;
333 bool sibling_only = false;
334 338
335 BUG_ON(!res); 339 if (!res)
336 340 return -EINVAL;
337 start = res->start;
338 end = res->end;
339 BUG_ON(start >= end);
340 341
341 if (first_level_children_only) 342 if (start >= end)
342 sibling_only = true; 343 return -EINVAL;
343 344
344 read_lock(&resource_lock); 345 read_lock(&resource_lock);
345 346
346 for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { 347 for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
347 if ((p->flags & res->flags) != res->flags) 348 if ((p->flags & flags) != flags)
348 continue; 349 continue;
349 if ((desc != IORES_DESC_NONE) && (desc != p->desc)) 350 if ((desc != IORES_DESC_NONE) && (desc != p->desc))
350 continue; 351 continue;
@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
352 p = NULL; 353 p = NULL;
353 break; 354 break;
354 } 355 }
355 if ((p->end >= start) && (p->start < end)) 356 if ((p->end >= start) && (p->start <= end))
356 break; 357 break;
357 } 358 }
358 359
359 read_unlock(&resource_lock); 360 read_unlock(&resource_lock);
360 if (!p) 361 if (!p)
361 return -1; 362 return -1;
363
362 /* copy data */ 364 /* copy data */
363 if (res->start < p->start) 365 res->start = max(start, p->start);
364 res->start = p->start; 366 res->end = min(end, p->end);
365 if (res->end > p->end)
366 res->end = p->end;
367 res->flags = p->flags; 367 res->flags = p->flags;
368 res->desc = p->desc; 368 res->desc = p->desc;
369 return 0; 369 return 0;
370} 370}
371 371
372static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, 372static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
373 bool first_level_children_only, 373 unsigned long flags, unsigned long desc,
374 void *arg, 374 bool first_lvl, void *arg,
375 int (*func)(struct resource *, void *)) 375 int (*func)(struct resource *, void *))
376{ 376{
377 u64 orig_end = res->end; 377 struct resource res;
378 int ret = -1; 378 int ret = -1;
379 379
380 while ((res->start < res->end) && 380 while (start < end &&
381 !find_next_iomem_res(res, desc, first_level_children_only)) { 381 !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
382 ret = (*func)(res, arg); 382 ret = (*func)(&res, arg);
383 if (ret) 383 if (ret)
384 break; 384 break;
385 385
386 res->start = res->end + 1; 386 start = res.end + 1;
387 res->end = orig_end;
388 } 387 }
389 388
390 return ret; 389 return ret;
391} 390}
392 391
393/* 392/**
394 * Walks through iomem resources and calls func() with matching resource 393 * Walks through iomem resources and calls func() with matching resource
395 * ranges. This walks through whole tree and not just first level children. 394 * ranges. This walks through whole tree and not just first level children.
396 * All the memory ranges which overlap start,end and also match flags and 395 * All the memory ranges which overlap start,end and also match flags and
@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
407int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, 406int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
408 u64 end, void *arg, int (*func)(struct resource *, void *)) 407 u64 end, void *arg, int (*func)(struct resource *, void *))
409{ 408{
410 struct resource res; 409 return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
411
412 res.start = start;
413 res.end = end;
414 res.flags = flags;
415
416 return __walk_iomem_res_desc(&res, desc, false, arg, func);
417} 410}
418EXPORT_SYMBOL_GPL(walk_iomem_res_desc); 411EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
419 412
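For reference, a minimal sketch of how the exported walker is typically driven. The callback and the byte-counting wrapper below are illustrative, not taken from this series; note that the end argument is inclusive:

    /* Sum the bytes of persistent memory that intersect [start..end]. */
    static int count_bytes(struct resource *res, void *arg)
    {
            u64 *total = arg;

            *total += resource_size(res);
            return 0;               /* a non-zero return stops the walk */
    }

    static u64 pmem_bytes_in_range(u64 start, u64 end)
    {
            u64 total = 0;

            walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY,
                                IORESOURCE_MEM | IORESOURCE_BUSY,
                                start, end, &total, count_bytes);
            return total;
    }
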
@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
425 * ranges. 418 * ranges.
426 */ 419 */
427int walk_system_ram_res(u64 start, u64 end, void *arg, 420int walk_system_ram_res(u64 start, u64 end, void *arg,
428 int (*func)(struct resource *, void *)) 421 int (*func)(struct resource *, void *))
429{ 422{
430 struct resource res; 423 unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
431 424
432 res.start = start; 425 return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
433 res.end = end;
434 res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
435
436 return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
437 arg, func); 426 arg, func);
438} 427}
439 428
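A hedged sketch of a walk_system_ram_res() caller; the callback is hypothetical, but kexec-style users follow this shape, being handed each busy System RAM range that intersects the (inclusive) window:

    /* Print every busy System RAM range in the whole address space. */
    static int note_ram_range(struct resource *res, void *arg)
    {
            pr_info("System RAM: %pR\n", res);
            return 0;
    }

    static void dump_system_ram(void)
    {
            walk_system_ram_res(0, -1ULL, NULL, note_ram_range);
    }

walk_mem_res() below is the same pattern with IORESOURCE_MEM | IORESOURCE_BUSY, and both wrappers pass first_lvl = true, so only first-level children of iomem_resource are visited.
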
@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
444int walk_mem_res(u64 start, u64 end, void *arg, 433int walk_mem_res(u64 start, u64 end, void *arg,
445 int (*func)(struct resource *, void *)) 434 int (*func)(struct resource *, void *))
446{ 435{
447 struct resource res; 436 unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
448
449 res.start = start;
450 res.end = end;
451 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
452 437
453 return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, 438 return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
454 arg, func); 439 arg, func);
455} 440}
456 441
@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
462 * It is to be used only for System RAM. 447 * It is to be used only for System RAM.
463 */ 448 */
464int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, 449int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
465 void *arg, int (*func)(unsigned long, unsigned long, void *)) 450 void *arg, int (*func)(unsigned long, unsigned long, void *))
466{ 451{
452 resource_size_t start, end;
453 unsigned long flags;
467 struct resource res; 454 struct resource res;
468 unsigned long pfn, end_pfn; 455 unsigned long pfn, end_pfn;
469 u64 orig_end;
470 int ret = -1; 456 int ret = -1;
471 457
472 res.start = (u64) start_pfn << PAGE_SHIFT; 458 start = (u64) start_pfn << PAGE_SHIFT;
473 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 459 end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
474 res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 460 flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
475 orig_end = res.end; 461 while (start < end &&
476 while ((res.start < res.end) && 462 !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
477 (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { 463 true, &res)) {
478 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 464 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
479 end_pfn = (res.end + 1) >> PAGE_SHIFT; 465 end_pfn = (res.end + 1) >> PAGE_SHIFT;
480 if (end_pfn > pfn) 466 if (end_pfn > pfn)
481 ret = (*func)(pfn, end_pfn - pfn, arg); 467 ret = (*func)(pfn, end_pfn - pfn, arg);
482 if (ret) 468 if (ret)
483 break; 469 break;
484 res.start = res.end + 1; 470 start = res.end + 1;
485 res.end = orig_end;
486 } 471 }
487 return ret; 472 return ret;
488} 473}
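The pfn-based variant hands the callback page-frame numbers rather than a struct resource. An illustrative caller (both names are hypothetical):

    /* Count the System RAM pages inside a pfn window. */
    static int count_ram_pages(unsigned long start_pfn, unsigned long nr_pages,
                               void *arg)
    {
            unsigned long *pages = arg;

            *pages += nr_pages;
            return 0;
    }

    static unsigned long ram_pages(unsigned long start_pfn, unsigned long nr_pages)
    {
            unsigned long pages = 0;

            walk_system_ram_range(start_pfn, nr_pages, &pages, count_ram_pages);
            return pages;
    }
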
@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
658 * @constraint: the size and alignment constraints to be met. 643 * @constraint: the size and alignment constraints to be met.
659 */ 644 */
660static int reallocate_resource(struct resource *root, struct resource *old, 645static int reallocate_resource(struct resource *root, struct resource *old,
661 resource_size_t newsize, 646 resource_size_t newsize,
662 struct resource_constraint *constraint) 647 struct resource_constraint *constraint)
663{ 648{
664 int err=0; 649 int err=0;
665 struct resource new = *old; 650 struct resource new = *old;
@@ -972,7 +957,7 @@ skip:
972 * Existing children of the resource are assumed to be immutable. 957 * Existing children of the resource are assumed to be immutable.
973 */ 958 */
974int adjust_resource(struct resource *res, resource_size_t start, 959int adjust_resource(struct resource *res, resource_size_t start,
975 resource_size_t size) 960 resource_size_t size)
976{ 961{
977 int result; 962 int result;
978 963
@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
983} 968}
984EXPORT_SYMBOL(adjust_resource); 969EXPORT_SYMBOL(adjust_resource);
985 970
986static void __init __reserve_region_with_split(struct resource *root, 971static void __init
987 resource_size_t start, resource_size_t end, 972__reserve_region_with_split(struct resource *root, resource_size_t start,
988 const char *name) 973 resource_size_t end, const char *name)
989{ 974{
990 struct resource *parent = root; 975 struct resource *parent = root;
991 struct resource *conflict; 976 struct resource *conflict;
@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
1044 1029
1045} 1030}
1046 1031
1047void __init reserve_region_with_split(struct resource *root, 1032void __init
1048 resource_size_t start, resource_size_t end, 1033reserve_region_with_split(struct resource *root, resource_size_t start,
1049 const char *name) 1034 resource_size_t end, const char *name)
1050{ 1035{
1051 int abort = 0; 1036 int abort = 0;
1052 1037
@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
1172 * The described resource region must match a currently busy region. 1157 * The described resource region must match a currently busy region.
1173 */ 1158 */
1174void __release_region(struct resource *parent, resource_size_t start, 1159void __release_region(struct resource *parent, resource_size_t start,
1175 resource_size_t n) 1160 resource_size_t n)
1176{ 1161{
1177 struct resource **p; 1162 struct resource **p;
1178 resource_size_t end; 1163 resource_size_t end;
@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
1234 * simplicity. Enhance this logic when necessary. 1219 * simplicity. Enhance this logic when necessary.
1235 */ 1220 */
1236int release_mem_region_adjustable(struct resource *parent, 1221int release_mem_region_adjustable(struct resource *parent,
1237 resource_size_t start, resource_size_t size) 1222 resource_size_t start, resource_size_t size)
1238{ 1223{
1239 struct resource **p; 1224 struct resource **p;
1240 struct resource *res; 1225 struct resource *res;
@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
1410 this->start == match->start && this->n == match->n; 1395 this->start == match->start && this->n == match->n;
1411} 1396}
1412 1397
1413struct resource * __devm_request_region(struct device *dev, 1398struct resource *
1414 struct resource *parent, resource_size_t start, 1399__devm_request_region(struct device *dev, struct resource *parent,
1415 resource_size_t n, const char *name) 1400 resource_size_t start, resource_size_t n, const char *name)
1416{ 1401{
1417 struct region_devres *dr = NULL; 1402 struct region_devres *dr = NULL;
1418 struct resource *res; 1403 struct resource *res;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 16f84142f2f4..f5516bae0c1b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
347 347
348void cpu_startup_entry(enum cpuhp_state state) 348void cpu_startup_entry(enum cpuhp_state state)
349{ 349{
350 /*
351 * This #ifdef needs to die, but it's too late in the cycle to
352 * make this generic (ARM and SH have never invoked the canary
353 * init for the non boot CPUs!). Will be fixed in 3.11
354 */
355#ifdef CONFIG_X86
356 /*
357 * If we're the non-boot CPU, nothing set the stack canary up
358 * for us. The boot CPU already has it initialized but no harm
359 * in doing it again. This is a good place for updating it, as
360 * we wont ever return from this function (so the invalid
361 * canaries already on the stack wont ever trigger).
362 */
363 boot_init_stack_canary();
364#endif
365 arch_cpu_idle_prepare(); 350 arch_cpu_idle_prepare();
366 cpuhp_online_idle(state); 351 cpuhp_online_idle(state);
367 while (1) 352 while (1)
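The canary setup removed here still has to happen before a secondary CPU enters its idle loop; presumably the arch-side CPU bring-up code now does it itself. A sketch under that assumption, with start_secondary() standing in for the x86 entry point (not a hunk shown on this page):

    /* Sketch only: x86 secondary-CPU entry initializing its own canary. */
    static void notrace start_secondary(void *unused)
    {
            /* ... existing per-CPU bring-up ... */

            /*
             * Set up the stack canary here instead of in the generic
             * cpu_startup_entry(); this function never returns, so stale
             * canaries already on the stack can never trip the check.
             */
            boot_init_stack_canary();

            cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
    }
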
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc24f2b8c646..b8c007713b3b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -56,7 +56,6 @@
56#include <linux/profile.h> 56#include <linux/profile.h>
57#include <linux/rcupdate_wait.h> 57#include <linux/rcupdate_wait.h>
58#include <linux/security.h> 58#include <linux/security.h>
59#include <linux/stackprotector.h>
60#include <linux/stop_machine.h> 59#include <linux/stop_machine.h>
61#include <linux/suspend.h> 60#include <linux/suspend.h>
62#include <linux/swait.h> 61#include <linux/swait.h>
diff --git a/kernel/smp.c b/kernel/smp.c
index d86eec5f51c1..163c451af42e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
669 * You must not call this function with disabled interrupts or 669 * You must not call this function with disabled interrupts or
670 * from a hardware interrupt handler or from a bottom half handler. 670 * from a hardware interrupt handler or from a bottom half handler.
671 */ 671 */
672void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), 672void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
673 smp_call_func_t func, void *info, bool wait, 673 smp_call_func_t func, void *info, bool wait,
674 gfp_t gfp_flags) 674 gfp_t gfp_flags, const struct cpumask *mask)
675{ 675{
676 cpumask_var_t cpus; 676 cpumask_var_t cpus;
677 int cpu, ret; 677 int cpu, ret;
@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
680 680
681 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { 681 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
682 preempt_disable(); 682 preempt_disable();
683 for_each_online_cpu(cpu) 683 for_each_cpu(cpu, mask)
684 if (cond_func(cpu, info)) 684 if (cond_func(cpu, info))
685 cpumask_set_cpu(cpu, cpus); 685 __cpumask_set_cpu(cpu, cpus);
686 on_each_cpu_mask(cpus, func, info, wait); 686 on_each_cpu_mask(cpus, func, info, wait);
687 preempt_enable(); 687 preempt_enable();
688 free_cpumask_var(cpus); 688 free_cpumask_var(cpus);
@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
692 * just have to IPI them one by one. 692 * just have to IPI them one by one.
693 */ 693 */
694 preempt_disable(); 694 preempt_disable();
695 for_each_online_cpu(cpu) 695 for_each_cpu(cpu, mask)
696 if (cond_func(cpu, info)) { 696 if (cond_func(cpu, info)) {
697 ret = smp_call_function_single(cpu, func, 697 ret = smp_call_function_single(cpu, func,
698 info, wait); 698 info, wait);
@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
701 preempt_enable(); 701 preempt_enable();
702 } 702 }
703} 703}
704EXPORT_SYMBOL(on_each_cpu_cond_mask);
705
706void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
707 smp_call_func_t func, void *info, bool wait,
708 gfp_t gfp_flags)
709{
710 on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
711 cpu_online_mask);
712}
704EXPORT_SYMBOL(on_each_cpu_cond); 713EXPORT_SYMBOL(on_each_cpu_cond);
705 714
706static void do_nothing(void *unused) 715static void do_nothing(void *unused)
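A minimal sketch of a caller of the new on_each_cpu_cond_mask() variant; the per-CPU flag and both callbacks are made up for illustration. The extra mask argument lets a caller restrict the conditional IPIs to a subset of CPUs (for example, a mm's cpumask) instead of every online CPU:

    static DEFINE_PER_CPU(int, pending_work);       /* hypothetical state */

    static bool cpu_has_work(int cpu, void *info)
    {
            return per_cpu(pending_work, cpu) != 0;
    }

    static void drain_work(void *info)
    {
            this_cpu_write(pending_work, 0);         /* runs in IPI context */
    }

    static void kick_busy_cpus(void)
    {
            /*
             * Only CPUs in the mask for which cpu_has_work() returns true
             * receive the IPI; wait = true blocks until drain_work() has
             * finished on all of them.
             */
            on_each_cpu_cond_mask(cpu_has_work, drain_work, NULL, true,
                                  GFP_KERNEL, cpu_online_mask);
    }

The old on_each_cpu_cond() keeps its behaviour by forwarding cpu_online_mask here, and NULL in the UP build below, where only CPU 0 exists.
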
diff --git a/kernel/up.c b/kernel/up.c
index 42c46bf3e0a5..ff536f9cc8a2 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
68 * Preemption is disabled here to make sure the cond_func is called under the 68 * Preemption is disabled here to make sure the cond_func is called under the
69 * same condtions in UP and SMP. 69 * same condtions in UP and SMP.
70 */ 70 */
71void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), 71void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
72 smp_call_func_t func, void *info, bool wait, 72 smp_call_func_t func, void *info, bool wait,
73 gfp_t gfp_flags) 73 gfp_t gfp_flags, const struct cpumask *mask)
74{ 74{
75 unsigned long flags; 75 unsigned long flags;
76 76
@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
82 } 82 }
83 preempt_enable(); 83 preempt_enable();
84} 84}
85EXPORT_SYMBOL(on_each_cpu_cond_mask);
86
87void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
88 smp_call_func_t func, void *info, bool wait,
89 gfp_t gfp_flags)
90{
91 on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
92}
85EXPORT_SYMBOL(on_each_cpu_cond); 93EXPORT_SYMBOL(on_each_cpu_cond);
86 94
87int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) 95int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cf2af04b34b9..532c29276fce 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/hugetlb.h>
11#include <asm/tlb.h> 12#include <asm/tlb.h>
12#include <asm-generic/pgtable.h> 13#include <asm-generic/pgtable.h>
13 14