author	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-23 12:05:28 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-10-23 12:05:28 -0400
commit	99792e0cea1ed733cdc8d0758677981e0cbebfed (patch)
tree	acf6868f48f687dd8667ee4f99c156415ea8ff7b
parent	382d72a9aa525b56ab8453ce61751fa712414d3d (diff)
parent	977e4be5eb714c48a67afc26a6c477f24130a1f2 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "Lots of changes in this cycle:

   - Lots of CPA (change page attribute) optimizations and related
     cleanups (Thomas Gleixner, Peter Zijlstra)

   - Make lazy TLB mode even lazier (Rik van Riel)

   - Fault handler cleanups and improvements (Dave Hansen)

   - kdump, vmcore: Enable kdumping encrypted memory with AMD SME
     enabled (Lianbo Jiang)

   - Clean up VM layout documentation (Baoquan He, Ingo Molnar)

   - ... plus misc other fixes and enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
  x86/stackprotector: Remove the call to boot_init_stack_canary() from cpu_startup_entry()
  x86/mm: Kill stray kernel fault handling comment
  x86/mm: Do not warn about PCI BIOS W+X mappings
  resource: Clean it up a bit
  resource: Fix find_next_iomem_res() iteration issue
  resource: Include resource end in walk_*() interfaces
  x86/kexec: Correct KEXEC_BACKUP_SRC_END off-by-one error
  x86/mm: Remove spurious fault pkey check
  x86/mm/vsyscall: Consider vsyscall page part of user address space
  x86/mm: Add vsyscall address helper
  x86/mm: Fix exception table comments
  x86/mm: Add clarifying comments for user addr space
  x86/mm: Break out user address space handling
  x86/mm: Break out kernel address space handling
  x86/mm: Clarify hardware vs. software "error_code"
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Add freed_tables element to flush_tlb_info
  x86/mm/tlb: Add freed_tables argument to flush_tlb_mm_range
  smp,cpumask: introduce on_each_cpu_cond_mask
  smp: use __cpumask_set_cpu in on_each_cpu_cond
  ...
-rw-r--r--	Documentation/x86/x86_64/mm.txt	171
-rw-r--r--	arch/x86/Kconfig	8
-rw-r--r--	arch/x86/include/asm/io.h	3
-rw-r--r--	arch/x86/include/asm/kexec.h	2
-rw-r--r--	arch/x86/include/asm/page_64_types.h	15
-rw-r--r--	arch/x86/include/asm/tlb.h	21
-rw-r--r--	arch/x86/include/asm/tlbflush.h	33
-rw-r--r--	arch/x86/kernel/crash_dump_64.c	60
-rw-r--r--	arch/x86/kernel/ldt.c	2
-rw-r--r--	arch/x86/kernel/vm86_32.c	2
-rw-r--r--	arch/x86/mm/dump_pagetables.c	35
-rw-r--r--	arch/x86/mm/fault.c	288
-rw-r--r--	arch/x86/mm/init_32.c	23
-rw-r--r--	arch/x86/mm/ioremap.c	24
-rw-r--r--	arch/x86/mm/pageattr.c	627
-rw-r--r--	arch/x86/mm/tlb.c	167
-rw-r--r--	arch/x86/xen/smp_pv.c	2
-rw-r--r--	drivers/iommu/amd_iommu_init.c	14
-rw-r--r--	fs/proc/vmcore.c	34
-rw-r--r--	include/linux/crash_dump.h	4
-rw-r--r--	include/linux/smp.h	4
-rw-r--r--	kernel/kexec_core.c	6
-rw-r--r--	kernel/resource.c	141
-rw-r--r--	kernel/sched/idle.c	15
-rw-r--r--	kernel/sched/sched.h	1
-rw-r--r--	kernel/smp.c	19
-rw-r--r--	kernel/up.c	14
-rw-r--r--	mm/pgtable-generic.c	1
28 files changed, 1117 insertions, 619 deletions
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5432a96d31ff..702898633b00 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -1,55 +1,124 @@
-
-Virtual memory map with 4 level page tables:
-
-0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
-hole caused by [47:63] sign extension
-ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
-ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
-ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
-ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
-ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
-... unused hole ...
-ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
-... unused hole ...
-				    vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
-
-Virtual memory map with 5 level page tables:
-
-0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
-hole caused by [56:63] sign extension
-ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
-ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
-ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
-ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
-ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
-... unused hole ...
-ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
-... unused hole ...
-				    vaddr_end for KASLR
-fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping
-... unused hole ...
-ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
-... unused hole ...
-ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
-... unused hole ...
-ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
-ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space
-[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
-ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
-ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
+====================================================
+Complete virtual memory map with 4-level page tables
+====================================================
+
+Notes:
+
+ - Negative addresses such as "-23 TB" are absolute addresses in bytes, counted down
+   from the top of the 64-bit address space. It's easier to understand the layout
+   when seen both in absolute addresses and in distance-from-top notation.
+
+   For example 0xffffe90000000000 == -23 TB, it's 23 TB lower than the top of the
+   64-bit address space (ffffffffffffffff).
+
+   Note that as we get closer to the top of the address space, the notation changes
+   from TB to GB and then MB/KB.
+
+ - "16M TB" might look weird at first sight, but it's an easier to visualize size
+   notation than "16 EB", which few will recognize at first sight as 16 exabytes.
+   It also shows it nicely how incredibly large 64-bit address space is.
+
+========================================================================================================================
+    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+========================================================================================================================
+                  |            |                  |         |
+ 0000000000000000 |    0       | 00007fffffffffff |  128 TB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+                  |            |                  |         |
+ 0000800000000000 | +128    TB | ffff7fffffffffff | ~16M TB | ... huge, almost 64 bits wide hole of non-canonical
+                  |            |                  |         |     virtual memory addresses up to the -128 TB
+                  |            |                  |         |     starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+                                                            |
+                                                            | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+                  |            |                  |         |
+ ffff800000000000 | -128    TB | ffff87ffffffffff |    8 TB | ... guard hole, also reserved for hypervisor
+ ffff880000000000 | -120    TB | ffffc7ffffffffff |   64 TB | direct mapping of all physical memory (page_offset_base)
+ ffffc80000000000 |  -56    TB | ffffc8ffffffffff |    1 TB | ... unused hole
+ ffffc90000000000 |  -55    TB | ffffe8ffffffffff |   32 TB | vmalloc/ioremap space (vmalloc_base)
+ ffffe90000000000 |  -23    TB | ffffe9ffffffffff |    1 TB | ... unused hole
+ ffffea0000000000 |  -22    TB | ffffeaffffffffff |    1 TB | virtual memory map (vmemmap_base)
+ ffffeb0000000000 |  -21    TB | ffffebffffffffff |    1 TB | ... unused hole
+ ffffec0000000000 |  -20    TB | fffffbffffffffff |   16 TB | KASAN shadow memory
+ fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+                  |            |                  |         | vaddr_end for KASLR
+ fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 | -1.5    TB | fffffeffffffffff |  0.5 TB | LDT remap for PTI
+ ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+                                                            |
+                                                            | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+                  |            |                  |         |
+ ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048    MB |                  |         |
+ ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 |  -16    MB |                  |         |
+    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
+
+
+====================================================
+Complete virtual memory map with 5-level page tables
+====================================================
+
+Notes:
+
+ - With 56-bit addresses, user-space memory gets expanded by a factor of 512x,
+   from 0.125 PB to 64 PB. All kernel mappings shift down to the -64 PT starting
+   offset and many of the regions expand to support the much larger physical
+   memory supported.
+
+========================================================================================================================
+    Start addr    |   Offset   |     End addr     |  Size   | VM area description
+========================================================================================================================
+                  |            |                  |         |
+ 0000000000000000 |    0       | 00ffffffffffffff |   64 PB | user-space virtual memory, different per mm
+__________________|____________|__________________|_________|___________________________________________________________
+                  |            |                  |         |
+ 0000800000000000 |  +64    PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical
+                  |            |                  |         |     virtual memory addresses up to the -128 TB
+                  |            |                  |         |     starting offset of kernel mappings.
+__________________|____________|__________________|_________|___________________________________________________________
+                                                            |
+                                                            | Kernel-space virtual memory, shared between all processes:
+____________________________________________________________|___________________________________________________________
+                  |            |                  |         |
+ ff00000000000000 |  -64    PB | ff0fffffffffffff |    4 PB | ... guard hole, also reserved for hypervisor
+ ff10000000000000 |  -60    PB | ff8fffffffffffff |   32 PB | direct mapping of all physical memory (page_offset_base)
+ ff90000000000000 |  -28    PB | ff9fffffffffffff |    4 PB | LDT remap for PTI
+ ffa0000000000000 |  -24    PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base)
+ ffd2000000000000 | -11.5   PB | ffd3ffffffffffff |  0.5 PB | ... unused hole
+ ffd4000000000000 |  -11    PB | ffd5ffffffffffff |  0.5 PB | virtual memory map (vmemmap_base)
+ ffd6000000000000 | -10.5   PB | ffdeffffffffffff | 2.25 PB | ... unused hole
+ ffdf000000000000 |  -8.25  PB | fffffdffffffffff |   ~8 PB | KASAN shadow memory
+ fffffc0000000000 |   -4    TB | fffffdffffffffff |    2 TB | ... unused hole
+                  |            |                  |         | vaddr_end for KASLR
+ fffffe0000000000 |   -2    TB | fffffe7fffffffff |  0.5 TB | cpu_entry_area mapping
+ fffffe8000000000 | -1.5    TB | fffffeffffffffff |  0.5 TB | ... unused hole
+ ffffff0000000000 |   -1    TB | ffffff7fffffffff |  0.5 TB | %esp fixup stacks
+__________________|____________|__________________|_________|____________________________________________________________
+                                                            |
+                                                            | Identical layout to the 47-bit one from here on:
+____________________________________________________________|____________________________________________________________
+                  |            |                  |         |
+ ffffff8000000000 | -512    GB | ffffffeeffffffff |  444 GB | ... unused hole
+ ffffffef00000000 |  -68    GB | fffffffeffffffff |   64 GB | EFI region mapping space
+ ffffffff00000000 |   -4    GB | ffffffff7fffffff |    2 GB | ... unused hole
+ ffffffff80000000 |   -2    GB | ffffffff9fffffff |  512 MB | kernel text mapping, mapped to physical address 0
+ ffffffff80000000 |-2048    MB |                  |         |
+ ffffffffa0000000 |-1536    MB | fffffffffeffffff | 1520 MB | module mapping space
+ ffffffffff000000 |  -16    MB |                  |         |
+    FIXADDR_START | ~-11    MB | ffffffffff5fffff | ~0.5 MB | kernel-internal fixmap range, variable size and offset
+ ffffffffff600000 |  -10    MB | ffffffffff600fff |    4 kB | legacy vsyscall ABI
+ ffffffffffe00000 |   -2    MB | ffffffffffffffff |    2 MB | ... unused hole
+__________________|____________|__________________|_________|___________________________________________________________
 
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
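
The distance-from-top notation introduced above can be sanity-checked with trivial arithmetic. The following standalone snippet is an illustration added here (not part of the patch); it reproduces the "-23 TB" example from the notes:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long top = 0xffffffffffffffffULL;   /* top of the 64-bit space */
            unsigned long long hole = 0xffffe90000000000ULL;  /* start of the 1 TB unused hole */

            /* +1 because 0xffffffffffffffff is the last byte of the space */
            unsigned long long dist = top - hole + 1;

            printf("-%llu TB\n", dist >> 40);                  /* prints "-23 TB" */
            return 0;
    }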
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8282985d438a..ff425a2d286c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1487,6 +1487,14 @@ config X86_DIRECT_GBPAGES
 	  supports them), so don't confuse the user by printing
 	  that we have them enabled.
 
+config X86_CPA_STATISTICS
+	bool "Enable statistic for Change Page Attribute"
+	depends on DEBUG_FS
+	---help---
+	  Expose statistics about the Change Page Attribute mechanism, which
+	  helps to determine the effectiveness of preserving large and huge
+	  page mappings when mapping protections are changed.
+
 config ARCH_HAS_MEM_ENCRYPT
 	def_bool y
 
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 9a92a3ac2ac5..832da8229cc7 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -187,11 +187,12 @@ extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size)
 #define ioremap_nocache ioremap_nocache
 extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 #define ioremap_uc ioremap_uc
-
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 #define ioremap_cache ioremap_cache
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val);
 #define ioremap_prot ioremap_prot
+extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size);
+#define ioremap_encrypted ioremap_encrypted
 
 /**
  * ioremap     -   map bus memory into CPU space
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index f327236f0fa7..5125fca472bb 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -67,7 +67,7 @@ struct kimage;
 
 /* Memory to backup during crash kdump */
 #define KEXEC_BACKUP_SRC_START	(0UL)
-#define KEXEC_BACKUP_SRC_END	(640 * 1024UL)		/* 640K */
+#define KEXEC_BACKUP_SRC_END	(640 * 1024UL - 1)	/* 640K */
 
 /*
  * CPU does not save ss and sp on stack if execution is already
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 6afac386a434..cd0cf1c568b4 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -59,13 +59,16 @@
 #endif
 
 /*
- * Kernel image size is limited to 1GiB due to the fixmap living in the
- * next 1GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S). Use
- * 512MiB by default, leaving 1.5GiB for modules once the page tables
- * are fully set up. If kernel ASLR is configured, it can extend the
- * kernel page table mapping, reducing the size of the modules area.
+ * Maximum kernel image size is limited to 1 GiB, due to the fixmap living
+ * in the next 1 GiB (see level2_kernel_pgt in arch/x86/kernel/head_64.S).
+ *
+ * On KASLR use 1 GiB by default, leaving 1 GiB for modules once the
+ * page tables are fully set up.
+ *
+ * If KASLR is disabled we can shrink it to 0.5 GiB and increase the size
+ * of the modules area to 1.5 GiB.
  */
-#if defined(CONFIG_RANDOMIZE_BASE)
+#ifdef CONFIG_RANDOMIZE_BASE
 #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
 #else
 #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index cb0a1f470980..404b8b1d44f5 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -6,16 +6,23 @@
 #define tlb_end_vma(tlb, vma) do { } while (0)
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 
-#define tlb_flush(tlb)							\
-{									\
-	if (!tlb->fullmm && !tlb->need_flush_all)			\
-		flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end, 0UL);	\
-	else								\
-		flush_tlb_mm_range(tlb->mm, 0UL, TLB_FLUSH_ALL, 0UL);	\
-}
+static inline void tlb_flush(struct mmu_gather *tlb);
 
 #include <asm-generic/tlb.h>
 
+static inline void tlb_flush(struct mmu_gather *tlb)
+{
+	unsigned long start = 0UL, end = TLB_FLUSH_ALL;
+	unsigned int stride_shift = tlb_get_unmap_shift(tlb);
+
+	if (!tlb->fullmm && !tlb->need_flush_all) {
+		start = tlb->start;
+		end = tlb->end;
+	}
+
+	flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables);
+}
+
 /*
  * While x86 architecture in general requires an IPI to perform TLB
  * shootdown, enablement code for several hypervisors overrides
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 58ce5288878e..323a313947e0 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast. If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely. The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic. The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -547,23 +531,30 @@ struct flush_tlb_info {
 	unsigned long		start;
 	unsigned long		end;
 	u64			new_tlb_gen;
+	unsigned int		stride_shift;
+	bool			freed_tables;
 };
 
 #define local_flush_tlb() __flush_tlb()
 
-#define flush_tlb_mm(mm)	flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
+#define flush_tlb_mm(mm)						\
+		flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true)
 
 #define flush_tlb_range(vma, start, end)				\
-	flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
+	flush_tlb_mm_range((vma)->vm_mm, start, end,			\
+			   ((vma)->vm_flags & VM_HUGETLB)		\
+				? huge_page_shift(hstate_vma(vma))	\
+				: PAGE_SHIFT, false)
 
 extern void flush_tlb_all(void);
 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-				unsigned long end, unsigned long vmflag);
+				unsigned long end, unsigned int stride_shift,
+				bool freed_tables);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
 
 static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
 {
-	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+	flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
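
For illustration only (not part of this series): with the new prototype above, callers pass the page-walk stride and whether page tables were freed instead of VM flags. A hypothetical flush of a huge-page-mapped range whose page tables were also torn down might look like:

    /* Hypothetical caller, sketched against the new interface above:
     * the 2 MB stride lets the flush use one INVLPG per huge page, and
     * freed_tables == true tells remote CPUs to drop cached intermediate
     * paging-structure entries as well.
     */
    static void example_flush(struct mm_struct *mm, unsigned long start, unsigned long end)
    {
            flush_tlb_mm_range(mm, start, end, PMD_SHIFT, true);
    }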
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 4f2e0778feac..eb8ab3915268 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -11,40 +11,62 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 
-/**
- * copy_oldmem_page - copy one page from "oldmem"
- * @pfn: page frame number to be copied
- * @buf: target memory address for the copy; this can be in kernel address
- *	space or user address space (see @userbuf)
- * @csize: number of bytes to copy
- * @offset: offset in bytes into the page (based on pfn) to begin the copy
- * @userbuf: if set, @buf is in user address space, use copy_to_user(),
- *	otherwise @buf is in kernel address space, use memcpy().
- *
- * Copy a page from "oldmem". For this page, there is no pte mapped
- * in the current kernel. We stitch up a pte, similar to kmap_atomic.
- */
-ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-		size_t csize, unsigned long offset, int userbuf)
+static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+				  unsigned long offset, int userbuf,
+				  bool encrypted)
 {
 	void *vaddr;
 
 	if (!csize)
 		return 0;
 
-	vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (encrypted)
+		vaddr = (__force void *)ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);
+	else
+		vaddr = (__force void *)ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
+
 	if (!vaddr)
 		return -ENOMEM;
 
 	if (userbuf) {
-		if (copy_to_user(buf, vaddr + offset, csize)) {
-			iounmap(vaddr);
+		if (copy_to_user((void __user *)buf, vaddr + offset, csize)) {
+			iounmap((void __iomem *)vaddr);
 			return -EFAULT;
 		}
 	} else
 		memcpy(buf, vaddr + offset, csize);
 
 	set_iounmap_nonlazy();
-	iounmap(vaddr);
+	iounmap((void __iomem *)vaddr);
 	return csize;
 }
+
+/**
+ * copy_oldmem_page - copy one page of memory
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ *	space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ *	otherwise @buf is in kernel address space, use memcpy().
+ *
+ * Copy a page from the old kernel's memory. For this page, there is no pte
+ * mapped in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
+			 unsigned long offset, int userbuf)
+{
+	return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, false);
+}
+
+/**
+ * copy_oldmem_page_encrypted - same as copy_oldmem_page() above but ioremap the
+ * memory with the encryption mask set to accommodate kdump on SME-enabled
+ * machines.
+ */
+ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+				   unsigned long offset, int userbuf)
+{
+	return __copy_oldmem_page(pfn, buf, csize, offset, userbuf, true);
+}
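
A rough sketch of how a kdump read path can choose between the two helpers above; the function name and encrypted flag plumbing below are placeholders for illustration — the real selection logic lives in fs/proc/vmcore.c (also touched by this merge) and is not shown in this excerpt:

    /* Illustrative only: dispatch to the encrypted variant when the old
     * kernel's memory was SME-encrypted, otherwise use the plain copy.
     */
    static ssize_t read_oldmem_pfn(unsigned long pfn, char *buf, size_t csize,
                                   unsigned long offset, int userbuf, bool encrypted)
    {
            if (encrypted)
                    return copy_oldmem_page_encrypted(pfn, buf, csize, offset, userbuf);

            return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
    }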
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 733e6ace0fa4..ab18e0884dc6 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -273,7 +273,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
 	map_ldt_struct_to_user(mm);
 
 	va = (unsigned long)ldt_slot_va(slot);
-	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false);
 
 	ldt->slot = slot;
 	return 0;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 1c03e4aa6474..c2fd39752da8 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -199,7 +199,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
199 pte_unmap_unlock(pte, ptl); 199 pte_unmap_unlock(pte, ptl);
200out: 200out:
201 up_write(&mm->mmap_sem); 201 up_write(&mm->mmap_sem);
202 flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); 202 flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
203} 203}
204 204
205 205
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a12afff146d1..fc37bbd23eb8 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -19,7 +19,9 @@
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/highmem.h>
+#include <linux/pci.h>
 
+#include <asm/e820/types.h>
 #include <asm/pgtable.h>
 
 /*
@@ -241,6 +243,29 @@ static unsigned long normalize_addr(unsigned long u)
 	return (signed long)(u << shift) >> shift;
 }
 
+static void note_wx(struct pg_state *st)
+{
+	unsigned long npages;
+
+	npages = (st->current_address - st->start_address) / PAGE_SIZE;
+
+#ifdef CONFIG_PCI_BIOS
+	/*
+	 * If PCI BIOS is enabled, the PCI BIOS area is forced to WX.
+	 * Inform about it, but avoid the warning.
+	 */
+	if (pcibios_enabled && st->start_address >= PAGE_OFFSET + BIOS_BEGIN &&
+	    st->current_address <= PAGE_OFFSET + BIOS_END) {
+		pr_warn_once("x86/mm: PCI BIOS W+X mapping %lu pages\n", npages);
+		return;
+	}
+#endif
+	/* Account the WX pages */
+	st->wx_pages += npages;
+	WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n",
+		  (void *)st->start_address);
+}
+
 /*
  * This function gets called on a break in a continuous series
  * of PTE entries; the next one is different so we need to
@@ -276,14 +301,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		unsigned long delta;
 		int width = sizeof(unsigned long) * 2;
 
-		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
-			WARN_ONCE(1,
-				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
-				  (void *)st->start_address,
-				  (void *)st->start_address);
-			st->wx_pages += (st->current_address -
-					 st->start_address) / PAGE_SIZE;
-		}
+		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX))
+			note_wx(st);
 
 		/*
 		 * Now print the actual finished series
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 0d45f6debb3a..2b1519bc5381 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -851,6 +851,15 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	show_opcodes(regs, loglvl);
 }
 
+/*
+ * The (legacy) vsyscall page is the long page in the kernel portion
+ * of the address space that has user-accessible permissions.
+ */
+static bool is_vsyscall_vaddr(unsigned long vaddr)
+{
+	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
+}
+
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		       unsigned long address, u32 *pkey, int si_code)
@@ -874,18 +883,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	if (is_errata100(regs, address))
 		return;
 
-#ifdef CONFIG_X86_64
-	/*
-	 * Instruction fetch faults in the vsyscall page might need
-	 * emulation.
-	 */
-	if (unlikely((error_code & X86_PF_INSTR) &&
-		     ((address & ~0xfff) == VSYSCALL_ADDR))) {
-		if (emulate_vsyscall(regs, address))
-			return;
-	}
-#endif
-
 	/*
 	 * To avoid leaking information about the kernel page table
 	 * layout, pretend that user-mode accesses to kernel addresses
@@ -1043,19 +1040,13 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	}
 }
 
-static int spurious_fault_check(unsigned long error_code, pte_t *pte)
+static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
 {
 	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;
 
 	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
-	/*
-	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set X86_PF_PK.
-	 */
-	if ((error_code & X86_PF_PK))
-		return 1;
 
 	return 1;
 }
@@ -1082,7 +1073,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * (Optional Invalidation).
  */
 static noinline int
-spurious_fault(unsigned long error_code, unsigned long address)
+spurious_kernel_fault(unsigned long error_code, unsigned long address)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -1113,27 +1104,27 @@ spurious_fault(unsigned long error_code, unsigned long address)
 		return 0;
 
 	if (p4d_large(*p4d))
-		return spurious_fault_check(error_code, (pte_t *) p4d);
+		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
 
 	pud = pud_offset(p4d, address);
 	if (!pud_present(*pud))
 		return 0;
 
 	if (pud_large(*pud))
-		return spurious_fault_check(error_code, (pte_t *) pud);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pud);
 
 	pmd = pmd_offset(pud, address);
 	if (!pmd_present(*pmd))
 		return 0;
 
 	if (pmd_large(*pmd))
-		return spurious_fault_check(error_code, (pte_t *) pmd);
+		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 
 	pte = pte_offset_kernel(pmd, address);
 	if (!pte_present(*pte))
 		return 0;
 
-	ret = spurious_fault_check(error_code, pte);
+	ret = spurious_kernel_fault_check(error_code, pte);
 	if (!ret)
 		return 0;
 
@@ -1141,12 +1132,12 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * Make sure we have permissions in PMD.
 	 * If not, then there's a bug in the page tables:
 	 */
-	ret = spurious_fault_check(error_code, (pte_t *) pmd);
+	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
 	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
 
 	return ret;
 }
-NOKPROBE_SYMBOL(spurious_fault);
+NOKPROBE_SYMBOL(spurious_kernel_fault);
 
 int show_unhandled_signals = 1;
 
@@ -1193,6 +1184,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 
 static int fault_in_kernel_space(unsigned long address)
 {
+	/*
+	 * On 64-bit systems, the vsyscall page is at an address above
+	 * TASK_SIZE_MAX, but is not considered part of the kernel
+	 * address space.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
+		return false;
+
 	return address >= TASK_SIZE_MAX;
 }
 
@@ -1214,31 +1213,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
1214} 1213}
1215 1214
1216/* 1215/*
1217 * This routine handles page faults. It determines the address, 1216 * Called for all faults where 'address' is part of the kernel address
1218 * and the problem, and then passes it off to one of the appropriate 1217 * space. Might get called for faults that originate from *code* that
1219 * routines. 1218 * ran in userspace or the kernel.
1220 */ 1219 */
1221static noinline void 1220static void
1222__do_page_fault(struct pt_regs *regs, unsigned long error_code, 1221do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
1223 unsigned long address) 1222 unsigned long address)
1224{ 1223{
1225 struct vm_area_struct *vma; 1224 /*
1226 struct task_struct *tsk; 1225 * Protection keys exceptions only happen on user pages. We
1227 struct mm_struct *mm; 1226 * have no user pages in the kernel portion of the address
1228 vm_fault_t fault, major = 0; 1227 * space, so do not expect them here.
1229 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; 1228 */
1230 u32 pkey; 1229 WARN_ON_ONCE(hw_error_code & X86_PF_PK);
1231
1232 tsk = current;
1233 mm = tsk->mm;
1234
1235 prefetchw(&mm->mmap_sem);
1236
1237 if (unlikely(kmmio_fault(regs, address)))
1238 return;
1239 1230
1240 /* 1231 /*
1241 * We fault-in kernel-space virtual memory on-demand. The 1232 * We can fault-in kernel-space virtual memory on-demand. The
1242 * 'reference' page table is init_mm.pgd. 1233 * 'reference' page table is init_mm.pgd.
1243 * 1234 *
1244 * NOTE! We MUST NOT take any locks for this case. We may 1235 * NOTE! We MUST NOT take any locks for this case. We may
@@ -1246,41 +1237,74 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * only copy the information from the master page table,
 	 * nothing more.
 	 *
-	 * This verifies that the fault happens in kernel space
-	 * (error_code & 4) == 0, and that the fault was not a
-	 * protection error (error_code & 9) == 0.
+	 * Before doing this on-demand faulting, ensure that the
+	 * fault is not any of the following:
+	 * 1. A fault on a PTE with a reserved bit set.
+	 * 2. A fault caused by a user-mode access. (Do not demand-
+	 *    fault kernel memory due to user-mode accesses).
+	 * 3. A fault caused by a page-level protection violation.
+	 *    (A demand fault would be on a non-present page which
+	 *     would have X86_PF_PROT==0).
 	 */
-	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
-			if (vmalloc_fault(address) >= 0)
-				return;
-		}
-
-		/* Can handle a stale RO->RW TLB: */
-		if (spurious_fault(error_code, address))
-			return;
-
-		/* kprobes don't want to hook the spurious faults: */
-		if (kprobes_fault(regs))
-			return;
-		/*
-		 * Don't take the mm semaphore here. If we fixup a prefetch
-		 * fault we could otherwise deadlock:
-		 */
-		bad_area_nosemaphore(regs, error_code, address, NULL);
-
-		return;
-	}
+	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
+		if (vmalloc_fault(address) >= 0)
+			return;
+	}
+
+	/* Was the fault spurious, caused by lazy TLB invalidation? */
+	if (spurious_kernel_fault(hw_error_code, address))
+		return;
+
+	/* kprobes don't want to hook the spurious faults: */
+	if (kprobes_fault(regs))
+		return;
+
+	/*
+	 * Note, despite being a "bad area", there are quite a few
+	 * acceptable reasons to get here, such as erratum fixups
+	 * and handling kernel code that can fault, like get_user().
+	 *
+	 * Don't take the mm semaphore here. If we fixup a prefetch
+	 * fault we could otherwise deadlock:
+	 */
+	bad_area_nosemaphore(regs, hw_error_code, address, NULL);
+}
+NOKPROBE_SYMBOL(do_kern_addr_fault);
+
+/* Handle faults in the user portion of the address space */
+static inline
+void do_user_addr_fault(struct pt_regs *regs,
+			unsigned long hw_error_code,
+			unsigned long address)
+{
+	unsigned long sw_error_code;
+	struct vm_area_struct *vma;
+	struct task_struct *tsk;
+	struct mm_struct *mm;
+	vm_fault_t fault, major = 0;
+	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+	u32 pkey;
+
+	tsk = current;
+	mm = tsk->mm;
 
 	/* kprobes don't want to hook the spurious faults: */
 	if (unlikely(kprobes_fault(regs)))
 		return;
 
-	if (unlikely(error_code & X86_PF_RSVD))
-		pgtable_bad(regs, error_code, address);
+	/*
+	 * Reserved bits are never expected to be set on
+	 * entries in the user portion of the page tables.
+	 */
+	if (unlikely(hw_error_code & X86_PF_RSVD))
+		pgtable_bad(regs, hw_error_code, address);
 
-	if (unlikely(smap_violation(error_code, regs))) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+	/*
+	 * Check for invalid kernel (supervisor) access to user
+	 * pages in the user address space.
+	 */
+	if (unlikely(smap_violation(hw_error_code, regs))) {
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
@@ -1289,11 +1313,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * in a region with pagefaults disabled then we must not take the fault
 	 */
 	if (unlikely(faulthandler_disabled() || !mm)) {
-		bad_area_nosemaphore(regs, error_code, address, NULL);
+		bad_area_nosemaphore(regs, hw_error_code, address, NULL);
 		return;
 	}
 
 	/*
+	 * hw_error_code is literally the "page fault error code" passed to
+	 * the kernel directly from the hardware. But, we will shortly be
+	 * modifying it in software, so give it a new name.
+	 */
+	sw_error_code = hw_error_code;
+
+	/*
 	 * It's safe to allow irq's after cr2 has been saved and the
 	 * vmalloc fault has been handled.
 	 *
@@ -1302,7 +1333,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	if (user_mode(regs)) {
 		local_irq_enable();
-		error_code |= X86_PF_USER;
+		/*
+		 * Up to this point, X86_PF_USER set in hw_error_code
+		 * indicated a user-mode access. But, after this,
+		 * X86_PF_USER in sw_error_code will indicate either
+		 * that, *or* an implicit kernel(supervisor)-mode access
+		 * which originated from user mode.
+		 */
+		if (!(hw_error_code & X86_PF_USER)) {
+			/*
+			 * The CPU was in user mode, but the CPU says
+			 * the fault was not a user-mode access.
+			 * Must be an implicit kernel-mode access,
+			 * which we do not expect to happen in the
+			 * user address space.
+			 */
+			pr_warn_once("kernel-mode error from user-mode: %lx\n",
+				     hw_error_code);
+
+			sw_error_code |= X86_PF_USER;
+		}
 		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1311,31 +1361,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-	if (error_code & X86_PF_WRITE)
+	if (sw_error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & X86_PF_INSTR)
+	if (sw_error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
+#ifdef CONFIG_X86_64
 	/*
-	 * When running in the kernel we expect faults to occur only to
-	 * addresses in user space. All other faults represent errors in
-	 * the kernel and should generate an OOPS. Unfortunately, in the
-	 * case of an erroneous fault occurring in a code path which already
-	 * holds mmap_sem we will deadlock attempting to validate the fault
-	 * against the address space. Luckily the kernel only validly
-	 * references user space from well defined areas of code, which are
-	 * listed in the exceptions table.
+	 * Instruction fetch faults in the vsyscall page might need
+	 * emulation. The vsyscall page is at a high address
+	 * (>PAGE_OFFSET), but is considered to be part of the user
+	 * address space.
 	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a
-	 * deadlock. Attempt to lock the address space, if we cannot we then
-	 * validate the source. If this is invalid we can skip the address
-	 * space check, thus avoiding the deadlock:
+	 * The vsyscall page does not have a "real" VMA, so do this
+	 * emulation before we go searching for VMAs.
+	 */
+	if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) {
+		if (emulate_vsyscall(regs, address))
+			return;
+	}
+#endif
+
+	/*
+	 * Kernel-mode access to the user address space should only occur
+	 * on well-defined single instructions listed in the exception
+	 * tables. But, an erroneous kernel fault occurring outside one of
+	 * those areas which also holds mmap_sem might deadlock attempting
+	 * to validate the fault against the address space.
+	 *
+	 * Only do the expensive exception table search when we might be at
+	 * risk of a deadlock. This happens if we
+	 * 1. Failed to acquire mmap_sem, and
+	 * 2. The access did not originate in userspace. Note: either the
+	 *    hardware or earlier page fault code may set X86_PF_USER
+	 *    in sw_error_code.
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if (!(error_code & X86_PF_USER) &&
+		if (!(sw_error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
-			bad_area_nosemaphore(regs, error_code, address, NULL);
+			/*
+			 * Fault from code in kernel from
+			 * which we do not expect faults.
+			 */
+			bad_area_nosemaphore(regs, sw_error_code, address, NULL);
 			return;
 		}
 retry:
@@ -1351,16 +1419,16 @@ retry:
 
 	vma = find_vma(mm, address);
 	if (unlikely(!vma)) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 	if (likely(vma->vm_start <= address))
 		goto good_area;
 	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
-	if (error_code & X86_PF_USER) {
+	if (sw_error_code & X86_PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter
@@ -1368,12 +1436,12 @@ retry:
 		 * 32 pointers and then decrements %sp by 65535.)
 		 */
 		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
-			bad_area(regs, error_code, address);
+			bad_area(regs, sw_error_code, address);
 			return;
 		}
 	}
 	if (unlikely(expand_stack(vma, address))) {
-		bad_area(regs, error_code, address);
+		bad_area(regs, sw_error_code, address);
 		return;
 	}
 
@@ -1382,8 +1450,8 @@ retry:
 	 * we can handle it..
 	 */
 good_area:
-	if (unlikely(access_error(error_code, vma))) {
-		bad_area_access_error(regs, error_code, address, vma);
+	if (unlikely(access_error(sw_error_code, vma))) {
+		bad_area_access_error(regs, sw_error_code, address, vma);
 		return;
 	}
 
@@ -1425,13 +1493,13 @@ good_area:
 			return;
 
 		/* Not returning to user mode? Handle exceptions or die: */
-		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+		no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
 
 	up_read(&mm->mmap_sem);
 	if (unlikely(fault & VM_FAULT_ERROR)) {
-		mm_fault_error(regs, error_code, address, &pkey, fault);
+		mm_fault_error(regs, sw_error_code, address, &pkey, fault);
 		return;
 	}
 
@@ -1449,6 +1517,28 @@ good_area:
 
 	check_v8086_mode(regs, address, tsk);
 }
+NOKPROBE_SYMBOL(do_user_addr_fault);
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ */
+static noinline void
+__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+		unsigned long address)
+{
+	prefetchw(&current->mm->mmap_sem);
+
+	if (unlikely(kmmio_fault(regs, address)))
+		return;
+
+	/* Was the fault on kernel-controlled part of the address space? */
+	if (unlikely(fault_in_kernel_space(address)))
+		do_kern_addr_fault(regs, hw_error_code, address);
+	else
+		do_user_addr_fault(regs, hw_error_code, address);
+}
 NOKPROBE_SYMBOL(__do_page_fault);
 
 static nokprobe_inline void
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 979e0a02cbe1..142c7d9f89cc 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -923,34 +923,19 @@ static void mark_nxdata_nx(void)
 void mark_rodata_ro(void)
 {
 	unsigned long start = PFN_ALIGN(_text);
-	unsigned long size = PFN_ALIGN(_etext) - start;
+	unsigned long size = (unsigned long)__end_rodata - start;
 
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-	printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+	pr_info("Write protecting kernel text and read-only data: %luk\n",
 		size >> 10);
 
 	kernel_set_to_readonly = 1;
 
 #ifdef CONFIG_CPA_DEBUG
-	printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
-		start, start+size);
-	set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
-
-	printk(KERN_INFO "Testing CPA: write protecting again\n");
-	set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
-
-	start += size;
-	size = (unsigned long)__end_rodata - start;
-	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
-	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
-		size >> 10);
-
-#ifdef CONFIG_CPA_DEBUG
-	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size);
+	pr_info("Testing CPA: Reverting %lx-%lx\n", start, start + size);
 	set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT);
 
-	printk(KERN_INFO "Testing CPA: write protecting again\n");
+	pr_info("Testing CPA: write protecting again\n");
 	set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
 #endif
 	mark_nxdata_nx();
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c63a545ec199..24e0920a9b25 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -131,7 +131,8 @@ static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
  * caller shouldn't need to know that small detail.
  */
 static void __iomem *__ioremap_caller(resource_size_t phys_addr,
-		unsigned long size, enum page_cache_mode pcm, void *caller)
+		unsigned long size, enum page_cache_mode pcm,
+		void *caller, bool encrypted)
 {
 	unsigned long offset, vaddr;
 	resource_size_t last_addr;
@@ -199,7 +200,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	 * resulting mapping.
 	 */
 	prot = PAGE_KERNEL_IO;
-	if (sev_active() && mem_flags.desc_other)
+	if ((sev_active() && mem_flags.desc_other) || encrypted)
 		prot = pgprot_encrypted(prot);
 
 	switch (pcm) {
@@ -291,7 +292,7 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
 
 	return __ioremap_caller(phys_addr, size, pcm,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_nocache);
 
@@ -324,7 +325,7 @@ void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
 
 	return __ioremap_caller(phys_addr, size, pcm,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL_GPL(ioremap_uc);
 
@@ -341,7 +342,7 @@ EXPORT_SYMBOL_GPL(ioremap_uc);
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
-					__builtin_return_address(0));
+					__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wc);
 
@@ -358,14 +359,21 @@ EXPORT_SYMBOL(ioremap_wc);
 void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
-					__builtin_return_address(0));
+					__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_wt);
 
+void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size)
+{
+	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
+				__builtin_return_address(0), true);
+}
+EXPORT_SYMBOL(ioremap_encrypted);
+
 void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 {
 	return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_cache);
 
@@ -374,7 +382,7 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 {
 	return __ioremap_caller(phys_addr, size,
 				pgprot2cachemode(__pgprot(prot_val)),
-				__builtin_return_address(0));
+				__builtin_return_address(0), false);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
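
A hedged usage sketch for the new ioremap_encrypted() interface above (illustrative only; the helper name below is made up — the in-tree user is __copy_oldmem_page() in arch/x86/kernel/crash_dump_64.c, shown earlier):

    /* Map one page of SME-encrypted memory with the encryption mask set,
     * copy it out, then unmap. Mirrors what the kdump copy path does.
     */
    static int read_encrypted_page(unsigned long pfn, void *dst)
    {
            void __iomem *vaddr = ioremap_encrypted(pfn << PAGE_SHIFT, PAGE_SIZE);

            if (!vaddr)
                    return -ENOMEM;

            memcpy_fromio(dst, vaddr, PAGE_SIZE);
            iounmap(vaddr);
            return 0;
    }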
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 51a5a69ecac9..62bb30b4bd2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -37,11 +37,20 @@ struct cpa_data {
 	unsigned long	numpages;
 	int		flags;
 	unsigned long	pfn;
-	unsigned	force_split	: 1;
+	unsigned	force_split		: 1,
+			force_static_prot	: 1;
 	int		curpage;
 	struct page	**pages;
 };
 
+enum cpa_warn {
+	CPA_CONFLICT,
+	CPA_PROTECT,
+	CPA_DETECT,
+};
+
+static const int cpa_warn_level = CPA_PROTECT;
+
 /*
  * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
  * using cpa_lock. So that we don't allow any other cpu, with stale large tlb
@@ -94,6 +103,87 @@ void arch_report_meminfo(struct seq_file *m)
94static inline void split_page_count(int level) { } 103static inline void split_page_count(int level) { }
95#endif 104#endif
96 105
106#ifdef CONFIG_X86_CPA_STATISTICS
107
108static unsigned long cpa_1g_checked;
109static unsigned long cpa_1g_sameprot;
110static unsigned long cpa_1g_preserved;
111static unsigned long cpa_2m_checked;
112static unsigned long cpa_2m_sameprot;
113static unsigned long cpa_2m_preserved;
114static unsigned long cpa_4k_install;
115
116static inline void cpa_inc_1g_checked(void)
117{
118 cpa_1g_checked++;
119}
120
121static inline void cpa_inc_2m_checked(void)
122{
123 cpa_2m_checked++;
124}
125
126static inline void cpa_inc_4k_install(void)
127{
128 cpa_4k_install++;
129}
130
131static inline void cpa_inc_lp_sameprot(int level)
132{
133 if (level == PG_LEVEL_1G)
134 cpa_1g_sameprot++;
135 else
136 cpa_2m_sameprot++;
137}
138
139static inline void cpa_inc_lp_preserved(int level)
140{
141 if (level == PG_LEVEL_1G)
142 cpa_1g_preserved++;
143 else
144 cpa_2m_preserved++;
145}
146
147static int cpastats_show(struct seq_file *m, void *p)
148{
149 seq_printf(m, "1G pages checked: %16lu\n", cpa_1g_checked);
150 seq_printf(m, "1G pages sameprot: %16lu\n", cpa_1g_sameprot);
151 seq_printf(m, "1G pages preserved: %16lu\n", cpa_1g_preserved);
152 seq_printf(m, "2M pages checked: %16lu\n", cpa_2m_checked);
153 seq_printf(m, "2M pages sameprot: %16lu\n", cpa_2m_sameprot);
154 seq_printf(m, "2M pages preserved: %16lu\n", cpa_2m_preserved);
155 seq_printf(m, "4K pages set-checked: %16lu\n", cpa_4k_install);
156 return 0;
157}
158
159static int cpastats_open(struct inode *inode, struct file *file)
160{
161 return single_open(file, cpastats_show, NULL);
162}
163
164static const struct file_operations cpastats_fops = {
165 .open = cpastats_open,
166 .read = seq_read,
167 .llseek = seq_lseek,
168 .release = single_release,
169};
170
171static int __init cpa_stats_init(void)
172{
173 debugfs_create_file("cpa_stats", S_IRUSR, arch_debugfs_dir, NULL,
174 &cpastats_fops);
175 return 0;
176}
177late_initcall(cpa_stats_init);
178#else
179static inline void cpa_inc_1g_checked(void) { }
180static inline void cpa_inc_2m_checked(void) { }
181static inline void cpa_inc_4k_install(void) { }
182static inline void cpa_inc_lp_sameprot(int level) { }
183static inline void cpa_inc_lp_preserved(int level) { }
184#endif
185
186
97static inline int 187static inline int
98within(unsigned long addr, unsigned long start, unsigned long end) 188within(unsigned long addr, unsigned long start, unsigned long end)
99{ 189{
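When CONFIG_X86_CPA_STATISTICS is enabled, the counters added above are exported through a debugfs file created in arch_debugfs_dir. A minimal reader, assuming debugfs is mounted at /sys/kernel/debug (so the file appears as /sys/kernel/debug/x86/cpa_stats) and that it runs as root:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/x86/cpa_stats";
	char line[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* The file is plain seq_file text; just echo it. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return EXIT_SUCCESS;
}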
@@ -195,14 +285,20 @@ static void cpa_flush_all(unsigned long cache)
195 on_each_cpu(__cpa_flush_all, (void *) cache, 1); 285 on_each_cpu(__cpa_flush_all, (void *) cache, 1);
196} 286}
197 287
198static void __cpa_flush_range(void *arg) 288static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
199{ 289{
200 /* 290 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
201 * We could optimize that further and do individual per page 291
202 * tlb invalidates for a low number of pages. Caveat: we must 292 WARN_ON(PAGE_ALIGN(start) != start);
203 * flush the high aliases on 64bit as well. 293
204 */ 294 if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
205 __flush_tlb_all(); 295 cpa_flush_all(cache);
296 return true;
297 }
298
299 flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
300
301 return !cache;
206} 302}
207 303
208static void cpa_flush_range(unsigned long start, int numpages, int cache) 304static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -210,12 +306,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
210 unsigned int i, level; 306 unsigned int i, level;
211 unsigned long addr; 307 unsigned long addr;
212 308
213 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); 309 if (__cpa_flush_range(start, numpages, cache))
214 WARN_ON(PAGE_ALIGN(start) != start);
215
216 on_each_cpu(__cpa_flush_range, NULL, 1);
217
218 if (!cache)
219 return; 310 return;
220 311
221 /* 312 /*
@@ -235,30 +326,13 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
235 } 326 }
236} 327}
237 328
238static void cpa_flush_array(unsigned long *start, int numpages, int cache, 329static void cpa_flush_array(unsigned long baddr, unsigned long *start,
330 int numpages, int cache,
239 int in_flags, struct page **pages) 331 int in_flags, struct page **pages)
240{ 332{
241 unsigned int i, level; 333 unsigned int i, level;
242#ifdef CONFIG_PREEMPT
243 /*
244 * Avoid wbinvd() because it causes latencies on all CPUs,
245 * regardless of any CPU isolation that may be in effect.
246 *
247 * This should be extended for CAT enabled systems independent of
248 * PREEMPT because wbinvd() does not respect the CAT partitions and
249 * this is exposed to unpriviledged users through the graphics
250 * subsystem.
251 */
252 unsigned long do_wbinvd = 0;
253#else
254 unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
255#endif
256
257 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
258 334
259 on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); 335 if (__cpa_flush_range(baddr, numpages, cache))
260
261 if (!cache || do_wbinvd)
262 return; 336 return;
263 337
264 /* 338 /*
@@ -286,84 +360,179 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
286 } 360 }
287} 361}
288 362
289/* 363static bool overlaps(unsigned long r1_start, unsigned long r1_end,
290 * Certain areas of memory on x86 require very specific protection flags, 364 unsigned long r2_start, unsigned long r2_end)
291 * for example the BIOS area or kernel text. Callers don't always get this
292 * right (again, ioremap() on BIOS memory is not uncommon) so this function
293 * checks and fixes these known static required protection bits.
294 */
295static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
296 unsigned long pfn)
297{ 365{
298 pgprot_t forbidden = __pgprot(0); 366 return (r1_start <= r2_end && r1_end >= r2_start) ||
367 (r2_start <= r1_end && r2_end >= r1_start);
368}
299 369
300 /*
301 * The BIOS area between 640k and 1Mb needs to be executable for
302 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
303 */
304#ifdef CONFIG_PCI_BIOS 370#ifdef CONFIG_PCI_BIOS
305 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 371/*
306 pgprot_val(forbidden) |= _PAGE_NX; 372 * The BIOS area between 640k and 1Mb needs to be executable for PCI BIOS
373 * based config access (CONFIG_PCI_GOBIOS) support.
374 */
375#define BIOS_PFN PFN_DOWN(BIOS_BEGIN)
376#define BIOS_PFN_END PFN_DOWN(BIOS_END - 1)
377
378static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
379{
380 if (pcibios_enabled && overlaps(spfn, epfn, BIOS_PFN, BIOS_PFN_END))
381 return _PAGE_NX;
382 return 0;
383}
384#else
385static pgprotval_t protect_pci_bios(unsigned long spfn, unsigned long epfn)
386{
387 return 0;
388}
307#endif 389#endif
308 390
309 /* 391/*
310 * The kernel text needs to be executable for obvious reasons 392 * The .rodata section needs to be read-only. Using the pfn catches all
311 * Does not cover __inittext since that is gone later on. On 393 * aliases. This also includes __ro_after_init, so do not enforce until
312 * 64bit we do not enforce !NX on the low mapping 394 * kernel_set_to_readonly is true.
313 */ 395 */
314 if (within(address, (unsigned long)_text, (unsigned long)_etext)) 396static pgprotval_t protect_rodata(unsigned long spfn, unsigned long epfn)
315 pgprot_val(forbidden) |= _PAGE_NX; 397{
398 unsigned long epfn_ro, spfn_ro = PFN_DOWN(__pa_symbol(__start_rodata));
316 399
317 /* 400 /*
318 * The .rodata section needs to be read-only. Using the pfn 401 * Note: __end_rodata is page aligned and not inclusive, so
319 * catches all aliases. This also includes __ro_after_init, 402 * subtract 1 to get the last enforced PFN in the rodata area.
320 * so do not enforce until kernel_set_to_readonly is true.
321 */ 403 */
322 if (kernel_set_to_readonly && 404 epfn_ro = PFN_DOWN(__pa_symbol(__end_rodata)) - 1;
323 within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, 405
324 __pa_symbol(__end_rodata) >> PAGE_SHIFT)) 406 if (kernel_set_to_readonly && overlaps(spfn, epfn, spfn_ro, epfn_ro))
325 pgprot_val(forbidden) |= _PAGE_RW; 407 return _PAGE_RW;
408 return 0;
409}
410
411/*
412 * Protect kernel text against becoming non executable by forbidding
413 * _PAGE_NX. This protects only the high kernel mapping (_text -> _etext)
414 * out of which the kernel actually executes. Do not protect the low
415 * mapping.
416 *
417 * This does not cover __inittext since that is gone after boot.
418 */
419static pgprotval_t protect_kernel_text(unsigned long start, unsigned long end)
420{
421 unsigned long t_end = (unsigned long)_etext - 1;
422 unsigned long t_start = (unsigned long)_text;
423
424 if (overlaps(start, end, t_start, t_end))
425 return _PAGE_NX;
426 return 0;
427}
326 428
327#if defined(CONFIG_X86_64) 429#if defined(CONFIG_X86_64)
430/*
431 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
432 * kernel text mappings for the large page aligned text, rodata sections
433 * will be always read-only. For the kernel identity mappings covering the
434 * holes caused by this alignment can be anything that user asks.
435 *
436 * This will preserve the large page mappings for kernel text/data at no
437 * extra cost.
438 */
439static pgprotval_t protect_kernel_text_ro(unsigned long start,
440 unsigned long end)
441{
442 unsigned long t_end = (unsigned long)__end_rodata_hpage_align - 1;
443 unsigned long t_start = (unsigned long)_text;
444 unsigned int level;
445
446 if (!kernel_set_to_readonly || !overlaps(start, end, t_start, t_end))
447 return 0;
328 /* 448 /*
329 * Once the kernel maps the text as RO (kernel_set_to_readonly is set), 449 * Don't enforce the !RW mapping for the kernel text mapping, if
330 * kernel text mappings for the large page aligned text, rodata sections 450 * the current mapping is already using small page mapping. No
331 * will be always read-only. For the kernel identity mappings covering 451 * need to work hard to preserve large page mappings in this case.
332 * the holes caused by this alignment can be anything that user asks.
333 * 452 *
334 * This will preserve the large page mappings for kernel text/data 453 * This also fixes the Linux Xen paravirt guest boot failure caused
335 * at no extra cost. 454 * by unexpected read-only mappings for kernel identity
455 * mappings. In this paravirt guest case, the kernel text mapping
456 * and the kernel identity mapping share the same page-table pages,
457 * so the protections for kernel text and identity mappings have to
458 * be the same.
336 */ 459 */
337 if (kernel_set_to_readonly && 460 if (lookup_address(start, &level) && (level != PG_LEVEL_4K))
338 within(address, (unsigned long)_text, 461 return _PAGE_RW;
339 (unsigned long)__end_rodata_hpage_align)) { 462 return 0;
340 unsigned int level; 463}
341 464#else
342 /* 465static pgprotval_t protect_kernel_text_ro(unsigned long start,
343 * Don't enforce the !RW mapping for the kernel text mapping, 466 unsigned long end)
344 * if the current mapping is already using small page mapping. 467{
345 * No need to work hard to preserve large page mappings in this 468 return 0;
346 * case. 469}
347 *
348 * This also fixes the Linux Xen paravirt guest boot failure
349 * (because of unexpected read-only mappings for kernel identity
350 * mappings). In this paravirt guest case, the kernel text
351 * mapping and the kernel identity mapping share the same
352 * page-table pages. Thus we can't really use different
353 * protections for the kernel text and identity mappings. Also,
354 * these shared mappings are made of small page mappings.
355 * Thus this don't enforce !RW mapping for small page kernel
356 * text mapping logic will help Linux Xen parvirt guest boot
357 * as well.
358 */
359 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
360 pgprot_val(forbidden) |= _PAGE_RW;
361 }
362#endif 470#endif
363 471
364 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 472static inline bool conflicts(pgprot_t prot, pgprotval_t val)
473{
474 return (pgprot_val(prot) & ~val) != pgprot_val(prot);
475}
365 476
366 return prot; 477static inline void check_conflict(int warnlvl, pgprot_t prot, pgprotval_t val,
478 unsigned long start, unsigned long end,
479 unsigned long pfn, const char *txt)
480{
481 static const char *lvltxt[] = {
482 [CPA_CONFLICT] = "conflict",
483 [CPA_PROTECT] = "protect",
484 [CPA_DETECT] = "detect",
485 };
486
487 if (warnlvl > cpa_warn_level || !conflicts(prot, val))
488 return;
489
490 pr_warn("CPA %8s %10s: 0x%016lx - 0x%016lx PFN %lx req %016llx prevent %016llx\n",
491 lvltxt[warnlvl], txt, start, end, pfn, (unsigned long long)pgprot_val(prot),
492 (unsigned long long)val);
493}
494
495/*
496 * Certain areas of memory on x86 require very specific protection flags,
497 * for example the BIOS area or kernel text. Callers don't always get this
498 * right (again, ioremap() on BIOS memory is not uncommon) so this function
499 * checks and fixes these known static required protection bits.
500 */
501static inline pgprot_t static_protections(pgprot_t prot, unsigned long start,
502 unsigned long pfn, unsigned long npg,
503 int warnlvl)
504{
505 pgprotval_t forbidden, res;
506 unsigned long end;
507
508 /*
509 * There is no point in checking RW/NX conflicts when the requested
510 * mapping is setting the page !PRESENT.
511 */
512 if (!(pgprot_val(prot) & _PAGE_PRESENT))
513 return prot;
514
515 /* Operate on the virtual address */
516 end = start + npg * PAGE_SIZE - 1;
517
518 res = protect_kernel_text(start, end);
519 check_conflict(warnlvl, prot, res, start, end, pfn, "Text NX");
520 forbidden = res;
521
522 res = protect_kernel_text_ro(start, end);
523 check_conflict(warnlvl, prot, res, start, end, pfn, "Text RO");
524 forbidden |= res;
525
526 /* Check the PFN directly */
527 res = protect_pci_bios(pfn, pfn + npg - 1);
528 check_conflict(warnlvl, prot, res, start, end, pfn, "PCIBIOS NX");
529 forbidden |= res;
530
531 res = protect_rodata(pfn, pfn + npg - 1);
532 check_conflict(warnlvl, prot, res, start, end, pfn, "Rodata RO");
533 forbidden |= res;
534
535 return __pgprot(pgprot_val(prot) & ~forbidden);
367} 536}
368 537
369/* 538/*
@@ -421,18 +590,18 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
421 */ 590 */
422pte_t *lookup_address(unsigned long address, unsigned int *level) 591pte_t *lookup_address(unsigned long address, unsigned int *level)
423{ 592{
424 return lookup_address_in_pgd(pgd_offset_k(address), address, level); 593 return lookup_address_in_pgd(pgd_offset_k(address), address, level);
425} 594}
426EXPORT_SYMBOL_GPL(lookup_address); 595EXPORT_SYMBOL_GPL(lookup_address);
427 596
428static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, 597static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
429 unsigned int *level) 598 unsigned int *level)
430{ 599{
431 if (cpa->pgd) 600 if (cpa->pgd)
432 return lookup_address_in_pgd(cpa->pgd + pgd_index(address), 601 return lookup_address_in_pgd(cpa->pgd + pgd_index(address),
433 address, level); 602 address, level);
434 603
435 return lookup_address(address, level); 604 return lookup_address(address, level);
436} 605}
437 606
438/* 607/*
@@ -549,40 +718,35 @@ static pgprot_t pgprot_clear_protnone_bits(pgprot_t prot)
549 return prot; 718 return prot;
550} 719}
551 720
552static int 721static int __should_split_large_page(pte_t *kpte, unsigned long address,
553try_preserve_large_page(pte_t *kpte, unsigned long address, 722 struct cpa_data *cpa)
554 struct cpa_data *cpa)
555{ 723{
556 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn, old_pfn; 724 unsigned long numpages, pmask, psize, lpaddr, pfn, old_pfn;
725 pgprot_t old_prot, new_prot, req_prot, chk_prot;
557 pte_t new_pte, old_pte, *tmp; 726 pte_t new_pte, old_pte, *tmp;
558 pgprot_t old_prot, new_prot, req_prot;
559 int i, do_split = 1;
560 enum pg_level level; 727 enum pg_level level;
561 728
562 if (cpa->force_split)
563 return 1;
564
565 spin_lock(&pgd_lock);
566 /* 729 /*
567 * Check for races, another CPU might have split this page 730 * Check for races, another CPU might have split this page
568 * up already: 731 * up already:
569 */ 732 */
570 tmp = _lookup_address_cpa(cpa, address, &level); 733 tmp = _lookup_address_cpa(cpa, address, &level);
571 if (tmp != kpte) 734 if (tmp != kpte)
572 goto out_unlock; 735 return 1;
573 736
574 switch (level) { 737 switch (level) {
575 case PG_LEVEL_2M: 738 case PG_LEVEL_2M:
576 old_prot = pmd_pgprot(*(pmd_t *)kpte); 739 old_prot = pmd_pgprot(*(pmd_t *)kpte);
577 old_pfn = pmd_pfn(*(pmd_t *)kpte); 740 old_pfn = pmd_pfn(*(pmd_t *)kpte);
741 cpa_inc_2m_checked();
578 break; 742 break;
579 case PG_LEVEL_1G: 743 case PG_LEVEL_1G:
580 old_prot = pud_pgprot(*(pud_t *)kpte); 744 old_prot = pud_pgprot(*(pud_t *)kpte);
581 old_pfn = pud_pfn(*(pud_t *)kpte); 745 old_pfn = pud_pfn(*(pud_t *)kpte);
746 cpa_inc_1g_checked();
582 break; 747 break;
583 default: 748 default:
584 do_split = -EINVAL; 749 return -EINVAL;
585 goto out_unlock;
586 } 750 }
587 751
588 psize = page_level_size(level); 752 psize = page_level_size(level);
@@ -592,8 +756,8 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
592 * Calculate the number of pages, which fit into this large 756 * Calculate the number of pages, which fit into this large
593 * page starting at address: 757 * page starting at address:
594 */ 758 */
595 nextpage_addr = (address + psize) & pmask; 759 lpaddr = (address + psize) & pmask;
596 numpages = (nextpage_addr - address) >> PAGE_SHIFT; 760 numpages = (lpaddr - address) >> PAGE_SHIFT;
597 if (numpages < cpa->numpages) 761 if (numpages < cpa->numpages)
598 cpa->numpages = numpages; 762 cpa->numpages = numpages;
599 763
@@ -620,71 +784,142 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
620 pgprot_val(req_prot) |= _PAGE_PSE; 784 pgprot_val(req_prot) |= _PAGE_PSE;
621 785
622 /* 786 /*
623 * old_pfn points to the large page base pfn. So we need 787 * old_pfn points to the large page base pfn. So we need to add the
624 * to add the offset of the virtual address: 788 * offset of the virtual address:
625 */ 789 */
626 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT); 790 pfn = old_pfn + ((address & (psize - 1)) >> PAGE_SHIFT);
627 cpa->pfn = pfn; 791 cpa->pfn = pfn;
628 792
629 new_prot = static_protections(req_prot, address, pfn); 793 /*
794 * Calculate the large page base address and the number of 4K pages
795 * in the large page
796 */
797 lpaddr = address & pmask;
798 numpages = psize >> PAGE_SHIFT;
630 799
631 /* 800 /*
632 * We need to check the full range, whether 801 * Sanity check that the existing mapping is correct versus the static
633 * static_protection() requires a different pgprot for one of 802 * protections. static_protections() guards against !PRESENT, so no
634 * the pages in the range we try to preserve: 803 * extra conditional required here.
635 */ 804 */
636 addr = address & pmask; 805 chk_prot = static_protections(old_prot, lpaddr, old_pfn, numpages,
637 pfn = old_pfn; 806 CPA_CONFLICT);
638 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
639 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
640 807
641 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 808 if (WARN_ON_ONCE(pgprot_val(chk_prot) != pgprot_val(old_prot))) {
642 goto out_unlock; 809 /*
810 * Split the large page and tell the split code to
811 * enforce static protections.
812 */
813 cpa->force_static_prot = 1;
814 return 1;
643 } 815 }
644 816
645 /* 817 /*
646 * If there are no changes, return. maxpages has been updated 818 * Optimization: If the requested pgprot is the same as the current
647 * above: 819 * pgprot, then the large page can be preserved and no updates are
820 * required independent of alignment and length of the requested
821 * range. The above already established that the current pgprot is
822 * correct, which in consequence makes the requested pgprot correct
823 * as well if it is the same. The static protection scan below will
824 * not come to a different conclusion.
648 */ 825 */
649 if (pgprot_val(new_prot) == pgprot_val(old_prot)) { 826 if (pgprot_val(req_prot) == pgprot_val(old_prot)) {
650 do_split = 0; 827 cpa_inc_lp_sameprot(level);
651 goto out_unlock; 828 return 0;
652 } 829 }
653 830
654 /* 831 /*
655 * We need to change the attributes. Check, whether we can 832 * If the requested range does not cover the full page, split it up
656 * change the large page in one go. We request a split, when
657 * the address is not aligned and the number of pages is
658 * smaller than the number of pages in the large page. Note
659 * that we limited the number of possible pages already to
660 * the number of pages in the large page.
661 */ 833 */
662 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { 834 if (address != lpaddr || cpa->numpages != numpages)
663 /* 835 return 1;
664 * The address is aligned and the number of pages
665 * covers the full page.
666 */
667 new_pte = pfn_pte(old_pfn, new_prot);
668 __set_pmd_pte(kpte, address, new_pte);
669 cpa->flags |= CPA_FLUSHTLB;
670 do_split = 0;
671 }
672 836
673out_unlock: 837 /*
838 * Check whether the requested pgprot is conflicting with a static
839 * protection requirement in the large page.
840 */
841 new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages,
842 CPA_DETECT);
843
844 /*
845 * If there is a conflict, split the large page.
846 *
847 * There used to be a 4k wise evaluation trying really hard to
848 * preserve the large pages, but experimentation has shown that this
849 * does not help at all. There might be corner cases which would
850 * preserve one large page occasionally, but it's really not worth the
851 * extra code and cycles for the common case.
852 */
853 if (pgprot_val(req_prot) != pgprot_val(new_prot))
854 return 1;
855
856 /* All checks passed. Update the large page mapping. */
857 new_pte = pfn_pte(old_pfn, new_prot);
858 __set_pmd_pte(kpte, address, new_pte);
859 cpa->flags |= CPA_FLUSHTLB;
860 cpa_inc_lp_preserved(level);
861 return 0;
862}
863
864static int should_split_large_page(pte_t *kpte, unsigned long address,
865 struct cpa_data *cpa)
866{
867 int do_split;
868
869 if (cpa->force_split)
870 return 1;
871
872 spin_lock(&pgd_lock);
873 do_split = __should_split_large_page(kpte, address, cpa);
674 spin_unlock(&pgd_lock); 874 spin_unlock(&pgd_lock);
675 875
676 return do_split; 876 return do_split;
677} 877}
678 878
879static void split_set_pte(struct cpa_data *cpa, pte_t *pte, unsigned long pfn,
880 pgprot_t ref_prot, unsigned long address,
881 unsigned long size)
882{
883 unsigned int npg = PFN_DOWN(size);
884 pgprot_t prot;
885
886 /*
887 * If should_split_large_page() discovered an inconsistent mapping,
888 * remove the invalid protection in the split mapping.
889 */
890 if (!cpa->force_static_prot)
891 goto set;
892
893 prot = static_protections(ref_prot, address, pfn, npg, CPA_PROTECT);
894
895 if (pgprot_val(prot) == pgprot_val(ref_prot))
896 goto set;
897
898 /*
899 * If this is splitting a PMD, fix it up. PUD splits cannot be
900 * fixed trivially as that would require rescanning the newly
901 * installed PMD mappings after returning from split_large_page()
902 * so an eventual further split can allocate the necessary PTE
903 * pages. Warn for now and revisit it in case this actually
904 * happens.
905 */
906 if (size == PAGE_SIZE)
907 ref_prot = prot;
908 else
909 pr_warn_once("CPA: Cannot fixup static protections for PUD split\n");
910set:
911 set_pte(pte, pfn_pte(pfn, ref_prot));
912}
913
679static int 914static int
680__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, 915__split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
681 struct page *base) 916 struct page *base)
682{ 917{
918 unsigned long lpaddr, lpinc, ref_pfn, pfn, pfninc = 1;
683 pte_t *pbase = (pte_t *)page_address(base); 919 pte_t *pbase = (pte_t *)page_address(base);
684 unsigned long ref_pfn, pfn, pfninc = 1;
685 unsigned int i, level; 920 unsigned int i, level;
686 pte_t *tmp;
687 pgprot_t ref_prot; 921 pgprot_t ref_prot;
922 pte_t *tmp;
688 923
689 spin_lock(&pgd_lock); 924 spin_lock(&pgd_lock);
690 /* 925 /*
@@ -707,15 +942,17 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
707 * PAT bit to correct position. 942 * PAT bit to correct position.
708 */ 943 */
709 ref_prot = pgprot_large_2_4k(ref_prot); 944 ref_prot = pgprot_large_2_4k(ref_prot);
710
711 ref_pfn = pmd_pfn(*(pmd_t *)kpte); 945 ref_pfn = pmd_pfn(*(pmd_t *)kpte);
946 lpaddr = address & PMD_MASK;
947 lpinc = PAGE_SIZE;
712 break; 948 break;
713 949
714 case PG_LEVEL_1G: 950 case PG_LEVEL_1G:
715 ref_prot = pud_pgprot(*(pud_t *)kpte); 951 ref_prot = pud_pgprot(*(pud_t *)kpte);
716 ref_pfn = pud_pfn(*(pud_t *)kpte); 952 ref_pfn = pud_pfn(*(pud_t *)kpte);
717 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; 953 pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT;
718 954 lpaddr = address & PUD_MASK;
955 lpinc = PMD_SIZE;
719 /* 956 /*
720 * Clear the PSE flags if the PRESENT flag is not set 957 * Clear the PSE flags if the PRESENT flag is not set
721 * otherwise pmd_present/pmd_huge will return true 958 * otherwise pmd_present/pmd_huge will return true
@@ -736,8 +973,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
736 * Get the target pfn from the original entry: 973 * Get the target pfn from the original entry:
737 */ 974 */
738 pfn = ref_pfn; 975 pfn = ref_pfn;
739 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) 976 for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc, lpaddr += lpinc)
740 set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); 977 split_set_pte(cpa, pbase + i, pfn, ref_prot, lpaddr, lpinc);
741 978
742 if (virt_addr_valid(address)) { 979 if (virt_addr_valid(address)) {
743 unsigned long pfn = PFN_DOWN(__pa(address)); 980 unsigned long pfn = PFN_DOWN(__pa(address));
@@ -756,14 +993,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
756 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); 993 __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
757 994
758 /* 995 /*
759 * Intel Atom errata AAH41 workaround. 996 * Do a global flush tlb after splitting the large page
997 * and before we do the actual change page attribute in the PTE.
998 *
999 * Without this, we violate the TLB application note, that says:
1000 * "The TLBs may contain both ordinary and large-page
1001 * translations for a 4-KByte range of linear addresses. This
1002 * may occur if software modifies the paging structures so that
1003 * the page size used for the address range changes. If the two
1004 * translations differ with respect to page frame or attributes
1005 * (e.g., permissions), processor behavior is undefined and may
1006 * be implementation-specific."
760 * 1007 *
761 * The real fix should be in hw or in a microcode update, but 1008 * We do this global tlb flush inside the cpa_lock, so that we
762 * we also probabilistically try to reduce the window of having 1009 * don't allow any other cpu, with stale tlb entries change the
763 * a large TLB mixed with 4K TLBs while instruction fetches are 1010 * page attribute in parallel, that also falls into the
764 * going on. 1011 * just split large page entry.
765 */ 1012 */
766 __flush_tlb_all(); 1013 flush_tlb_all();
767 spin_unlock(&pgd_lock); 1014 spin_unlock(&pgd_lock);
768 1015
769 return 0; 1016 return 0;
@@ -1247,7 +1494,9 @@ repeat:
1247 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 1494 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
1248 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 1495 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
1249 1496
1250 new_prot = static_protections(new_prot, address, pfn); 1497 cpa_inc_4k_install();
1498 new_prot = static_protections(new_prot, address, pfn, 1,
1499 CPA_PROTECT);
1251 1500
1252 new_prot = pgprot_clear_protnone_bits(new_prot); 1501 new_prot = pgprot_clear_protnone_bits(new_prot);
1253 1502
@@ -1273,7 +1522,7 @@ repeat:
1273 * Check, whether we can keep the large page intact 1522 * Check, whether we can keep the large page intact
1274 * and just change the pte: 1523 * and just change the pte:
1275 */ 1524 */
1276 do_split = try_preserve_large_page(kpte, address, cpa); 1525 do_split = should_split_large_page(kpte, address, cpa);
1277 /* 1526 /*
1278 * When the range fits into the existing large page, 1527 * When the range fits into the existing large page,
1279 * return. cp->numpages and cpa->tlbflush have been updated in 1528 * return. cp->numpages and cpa->tlbflush have been updated in
@@ -1286,28 +1535,8 @@ repeat:
1286 * We have to split the large page: 1535 * We have to split the large page:
1287 */ 1536 */
1288 err = split_large_page(cpa, kpte, address); 1537 err = split_large_page(cpa, kpte, address);
1289 if (!err) { 1538 if (!err)
1290 /*
1291 * Do a global flush tlb after splitting the large page
1292 * and before we do the actual change page attribute in the PTE.
1293 *
1294 * With out this, we violate the TLB application note, that says
1295 * "The TLBs may contain both ordinary and large-page
1296 * translations for a 4-KByte range of linear addresses. This
1297 * may occur if software modifies the paging structures so that
1298 * the page size used for the address range changes. If the two
1299 * translations differ with respect to page frame or attributes
1300 * (e.g., permissions), processor behavior is undefined and may
1301 * be implementation-specific."
1302 *
1303 * We do this global tlb flush inside the cpa_lock, so that we
1304 * don't allow any other cpu, with stale tlb entries change the
1305 * page attribute in parallel, that also falls into the
1306 * just split large page entry.
1307 */
1308 flush_tlb_all();
1309 goto repeat; 1539 goto repeat;
1310 }
1311 1540
1312 return err; 1541 return err;
1313} 1542}
@@ -1529,19 +1758,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
1529 cache = !!pgprot2cachemode(mask_set); 1758 cache = !!pgprot2cachemode(mask_set);
1530 1759
1531 /* 1760 /*
1532 * On success we use CLFLUSH, when the CPU supports it to 1761 * On error; flush everything to be sure.
1533 * avoid the WBINVD. If the CPU does not support it and in the
1534 * error case we fall back to cpa_flush_all (which uses
1535 * WBINVD):
1536 */ 1762 */
1537 if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { 1763 if (ret) {
1538 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1539 cpa_flush_array(addr, numpages, cache,
1540 cpa.flags, pages);
1541 } else
1542 cpa_flush_range(baddr, numpages, cache);
1543 } else
1544 cpa_flush_all(cache); 1764 cpa_flush_all(cache);
1765 goto out;
1766 }
1767
1768 if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
1769 cpa_flush_array(baddr, addr, numpages, cache,
1770 cpa.flags, pages);
1771 } else {
1772 cpa_flush_range(baddr, numpages, cache);
1773 }
1545 1774
1546out: 1775out:
1547 return ret; 1776 return ret;
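The rewritten flush path above funnels both cpa_flush_range() and cpa_flush_array() through __cpa_flush_range(), whose return value tells the caller whether any per-page cache maintenance is still needed. The toy below mirrors that contract with made-up names (flush_range(), have_clflush); it is a sketch of the decision logic, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the new flush decision: a full wbinvd-style flush is the
 * fallback when caching attributes changed but CLFLUSH is unavailable;
 * otherwise only the TLB range is flushed and, when 'cache' is set, the
 * caller still walks the range doing per-line cache flushes. */
static bool flush_range(unsigned long start, int numpages, int cache,
			int have_clflush)
{
	if (cache && !have_clflush) {
		printf("full cache+TLB flush (wbinvd path)\n");
		return true;            /* nothing left for the caller */
	}
	printf("flush TLB for %d pages at %#lx\n", numpages, start);
	return !cache;                  /* caller flushes cache lines if set */
}

int main(void)
{
	if (!flush_range(0xffff880000000000UL, 16, 1, 1))
		printf("caller: clflush each changed page\n");
	flush_range(0xffff880000000000UL, 16, 1, 0);
	return 0;
}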
@@ -1856,10 +2085,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
1856 /* 2085 /*
1857 * Before changing the encryption attribute, we need to flush caches. 2086 * Before changing the encryption attribute, we need to flush caches.
1858 */ 2087 */
1859 if (static_cpu_has(X86_FEATURE_CLFLUSH)) 2088 cpa_flush_range(start, numpages, 1);
1860 cpa_flush_range(start, numpages, 1);
1861 else
1862 cpa_flush_all(1);
1863 2089
1864 ret = __change_page_attr_set_clr(&cpa, 1); 2090 ret = __change_page_attr_set_clr(&cpa, 1);
1865 2091
@@ -1870,10 +2096,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
1870 * in case TLB flushing gets optimized in the cpa_flush_range() 2096 * in case TLB flushing gets optimized in the cpa_flush_range()
1871 * path use the same logic as above. 2097 * path use the same logic as above.
1872 */ 2098 */
1873 if (static_cpu_has(X86_FEATURE_CLFLUSH)) 2099 cpa_flush_range(start, numpages, 0);
1874 cpa_flush_range(start, numpages, 0);
1875 else
1876 cpa_flush_all(0);
1877 2100
1878 return ret; 2101 return ret;
1879} 2102}
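The static_protections() rework above replaces per-address checks with range checks built on the new overlaps() helper, where both range ends are inclusive, and accumulates the individual results into one forbidden mask that is stripped from the requested pgprot. A small user-space check of both ideas, with hypothetical PAGE_RW/PAGE_NX values and made-up PFN ranges:

#include <assert.h>
#include <stdio.h>

/* Same inclusive-range test the patch introduces: both ends belong to
 * the range, which is why callers pass e.g. PFN_DOWN(BIOS_END - 1). */
static int overlaps(unsigned long r1_start, unsigned long r1_end,
		    unsigned long r2_start, unsigned long r2_end)
{
	return (r1_start <= r2_end && r1_end >= r2_start) ||
	       (r2_start <= r1_end && r2_end >= r1_start);
}

/* Hypothetical bit values, only to show how static_protections() now
 * ORs the per-check results into one 'forbidden' mask and strips it. */
#define PAGE_RW (1ULL << 1)
#define PAGE_NX (1ULL << 63)

int main(void)
{
	unsigned long long req = PAGE_RW | PAGE_NX;   /* requested pgprot */
	unsigned long long forbidden = 0;

	assert(overlaps(10, 20, 20, 30));             /* touching ends overlap */
	assert(!overlaps(10, 19, 20, 30));

	if (overlaps(0xa0, 0xff, 0xa0, 0xbf))         /* pretend PCI BIOS pfns */
		forbidden |= PAGE_NX;
	if (overlaps(0x100, 0x1ff, 0x180, 0x1bf))     /* pretend rodata pfns */
		forbidden |= PAGE_RW;

	printf("req %#llx forbidden %#llx -> %#llx\n",
	       req, forbidden, req & ~forbidden);
	return 0;
}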
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e96b99eb800c..7d68489cfdb1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -185,8 +185,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
185{ 185{
186 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); 186 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
187 u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 187 u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
188 bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
188 unsigned cpu = smp_processor_id(); 189 unsigned cpu = smp_processor_id();
189 u64 next_tlb_gen; 190 u64 next_tlb_gen;
191 bool need_flush;
192 u16 new_asid;
190 193
191 /* 194 /*
192 * NB: The scheduler will call us with prev == next when switching 195 * NB: The scheduler will call us with prev == next when switching
@@ -240,20 +243,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
240 next->context.ctx_id); 243 next->context.ctx_id);
241 244
242 /* 245 /*
243 * We don't currently support having a real mm loaded without 246 * Even in lazy TLB mode, the CPU should stay set in the
244 * our cpu set in mm_cpumask(). We have all the bookkeeping 247 * mm_cpumask. The TLB shootdown code can figure out from
245 * in place to figure out whether we would need to flush 248 * cpu_tlbstate.is_lazy whether or not to send an IPI.
246 * if our cpu were cleared in mm_cpumask(), but we don't
247 * currently use it.
248 */ 249 */
249 if (WARN_ON_ONCE(real_prev != &init_mm && 250 if (WARN_ON_ONCE(real_prev != &init_mm &&
250 !cpumask_test_cpu(cpu, mm_cpumask(next)))) 251 !cpumask_test_cpu(cpu, mm_cpumask(next))))
251 cpumask_set_cpu(cpu, mm_cpumask(next)); 252 cpumask_set_cpu(cpu, mm_cpumask(next));
252 253
253 return; 254 /*
255 * If the CPU is not in lazy TLB mode, we are just switching
256 * from one thread in a process to another thread in the same
257 * process. No TLB flush required.
258 */
259 if (!was_lazy)
260 return;
261
262 /*
263 * Read the tlb_gen to check whether a flush is needed.
264 * If the TLB is up to date, just use it.
265 * The barrier synchronizes with the tlb_gen increment in
266 * the TLB shootdown code.
267 */
268 smp_mb();
269 next_tlb_gen = atomic64_read(&next->context.tlb_gen);
270 if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
271 next_tlb_gen)
272 return;
273
274 /*
275 * TLB contents went out of date while we were in lazy
276 * mode. Fall through to the TLB switching code below.
277 */
278 new_asid = prev_asid;
279 need_flush = true;
254 } else { 280 } else {
255 u16 new_asid;
256 bool need_flush;
257 u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id); 281 u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
258 282
259 /* 283 /*
@@ -308,46 +332,48 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
308 /* Let nmi_uaccess_okay() know that we're changing CR3. */ 332 /* Let nmi_uaccess_okay() know that we're changing CR3. */
309 this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); 333 this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING);
310 barrier(); 334 barrier();
335 }
311 336
312 if (need_flush) { 337 if (need_flush) {
313 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 338 this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
314 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 339 this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
315 load_new_mm_cr3(next->pgd, new_asid, true); 340 load_new_mm_cr3(next->pgd, new_asid, true);
316
317 /*
318 * NB: This gets called via leave_mm() in the idle path
319 * where RCU functions differently. Tracing normally
320 * uses RCU, so we need to use the _rcuidle variant.
321 *
322 * (There is no good reason for this. The idle code should
323 * be rearranged to call this before rcu_idle_enter().)
324 */
325 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
326 } else {
327 /* The new ASID is already up to date. */
328 load_new_mm_cr3(next->pgd, new_asid, false);
329
330 /* See above wrt _rcuidle. */
331 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
332 }
333 341
334 /* 342 /*
335 * Record last user mm's context id, so we can avoid 343 * NB: This gets called via leave_mm() in the idle path
336 * flushing branch buffer with IBPB if we switch back 344 * where RCU functions differently. Tracing normally
337 * to the same user. 345 * uses RCU, so we need to use the _rcuidle variant.
346 *
347 * (There is no good reason for this. The idle code should
348 * be rearranged to call this before rcu_idle_enter().)
338 */ 349 */
339 if (next != &init_mm) 350 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
340 this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id); 351 } else {
341 352 /* The new ASID is already up to date. */
342 /* Make sure we write CR3 before loaded_mm. */ 353 load_new_mm_cr3(next->pgd, new_asid, false);
343 barrier();
344 354
345 this_cpu_write(cpu_tlbstate.loaded_mm, next); 355 /* See above wrt _rcuidle. */
346 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); 356 trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
347 } 357 }
348 358
349 load_mm_cr4(next); 359 /*
350 switch_ldt(real_prev, next); 360 * Record last user mm's context id, so we can avoid
361 * flushing branch buffer with IBPB if we switch back
362 * to the same user.
363 */
364 if (next != &init_mm)
365 this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
366
367 /* Make sure we write CR3 before loaded_mm. */
368 barrier();
369
370 this_cpu_write(cpu_tlbstate.loaded_mm, next);
371 this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
372
373 if (next != real_prev) {
374 load_mm_cr4(next);
375 switch_ldt(real_prev, next);
376 }
351} 377}
352 378
353/* 379/*
@@ -368,20 +394,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
368 if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 394 if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
369 return; 395 return;
370 396
371 if (tlb_defer_switch_to_init_mm()) { 397 this_cpu_write(cpu_tlbstate.is_lazy, true);
372 /*
373 * There's a significant optimization that may be possible
374 * here. We have accurate enough TLB flush tracking that we
375 * don't need to maintain coherence of TLB per se when we're
376 * lazy. We do, however, need to maintain coherence of
377 * paging-structure caches. We could, in principle, leave our
378 * old mm loaded and only switch to init_mm when
379 * tlb_remove_page() happens.
380 */
381 this_cpu_write(cpu_tlbstate.is_lazy, true);
382 } else {
383 switch_mm(NULL, &init_mm, NULL);
384 }
385} 398}
386 399
387/* 400/*
@@ -468,6 +481,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
468 * paging-structure cache to avoid speculatively reading 481 * paging-structure cache to avoid speculatively reading
469 * garbage into our TLB. Since switching to init_mm is barely 482 * garbage into our TLB. Since switching to init_mm is barely
470 * slower than a minimal flush, just switch to init_mm. 483 * slower than a minimal flush, just switch to init_mm.
484 *
485 * This should be rare, with native_flush_tlb_others skipping
486 * IPIs to lazy TLB mode CPUs.
471 */ 487 */
472 switch_mm_irqs_off(NULL, &init_mm, NULL); 488 switch_mm_irqs_off(NULL, &init_mm, NULL);
473 return; 489 return;
@@ -528,17 +544,16 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
528 f->new_tlb_gen == local_tlb_gen + 1 && 544 f->new_tlb_gen == local_tlb_gen + 1 &&
529 f->new_tlb_gen == mm_tlb_gen) { 545 f->new_tlb_gen == mm_tlb_gen) {
530 /* Partial flush */ 546 /* Partial flush */
531 unsigned long addr; 547 unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
532 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; 548 unsigned long addr = f->start;
533 549
534 addr = f->start;
535 while (addr < f->end) { 550 while (addr < f->end) {
536 __flush_tlb_one_user(addr); 551 __flush_tlb_one_user(addr);
537 addr += PAGE_SIZE; 552 addr += 1UL << f->stride_shift;
538 } 553 }
539 if (local) 554 if (local)
540 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); 555 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_invalidate);
541 trace_tlb_flush(reason, nr_pages); 556 trace_tlb_flush(reason, nr_invalidate);
542 } else { 557 } else {
543 /* Full flush. */ 558 /* Full flush. */
544 local_flush_tlb(); 559 local_flush_tlb();
@@ -571,6 +586,11 @@ static void flush_tlb_func_remote(void *info)
571 flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN); 586 flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
572} 587}
573 588
589static bool tlb_is_not_lazy(int cpu, void *data)
590{
591 return !per_cpu(cpu_tlbstate.is_lazy, cpu);
592}
593
574void native_flush_tlb_others(const struct cpumask *cpumask, 594void native_flush_tlb_others(const struct cpumask *cpumask,
575 const struct flush_tlb_info *info) 595 const struct flush_tlb_info *info)
576{ 596{
@@ -606,8 +626,23 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
606 (void *)info, 1); 626 (void *)info, 1);
607 return; 627 return;
608 } 628 }
609 smp_call_function_many(cpumask, flush_tlb_func_remote, 629
630 /*
631 * If no page tables were freed, we can skip sending IPIs to
632 * CPUs in lazy TLB mode. They will flush their TLBs themselves
633 * at the next context switch.
634 *
635 * However, if page tables are getting freed, we need to send the
636 * IPI everywhere, to prevent CPUs in lazy TLB mode from tripping
637 * up on the new contents of what used to be page tables, while
638 * doing a speculative memory access.
639 */
640 if (info->freed_tables)
641 smp_call_function_many(cpumask, flush_tlb_func_remote,
610 (void *)info, 1); 642 (void *)info, 1);
643 else
644 on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func_remote,
645 (void *)info, 1, GFP_ATOMIC, cpumask);
611} 646}
612 647
613/* 648/*
@@ -623,12 +658,15 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
623static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; 658static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
624 659
625void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, 660void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
626 unsigned long end, unsigned long vmflag) 661 unsigned long end, unsigned int stride_shift,
662 bool freed_tables)
627{ 663{
628 int cpu; 664 int cpu;
629 665
630 struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { 666 struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
631 .mm = mm, 667 .mm = mm,
668 .stride_shift = stride_shift,
669 .freed_tables = freed_tables,
632 }; 670 };
633 671
634 cpu = get_cpu(); 672 cpu = get_cpu();
@@ -638,8 +676,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
638 676
639 /* Should we flush just the requested range? */ 677 /* Should we flush just the requested range? */
640 if ((end != TLB_FLUSH_ALL) && 678 if ((end != TLB_FLUSH_ALL) &&
641 !(vmflag & VM_HUGETLB) && 679 ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) {
642 ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
643 info.start = start; 680 info.start = start;
644 info.end = end; 681 info.end = end;
645 } else { 682 } else {
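flush_tlb_mm_range() now takes a stride shift and counts invalidations in strides rather than 4k pages, so flushing a huge-page range no longer falls back to a full flush just because it spans many base pages. A sketch of that arithmetic, assuming the default ceiling of 33 from tlb_single_page_flush_ceiling; the addresses and the flush() helper are illustrative only.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21

/* Mirrors the new partial-flush bookkeeping: the number of invalidations
 * is counted in strides, then compared against the ceiling. */
static void flush(unsigned long start, unsigned long end,
		  unsigned int stride_shift, unsigned long ceiling)
{
	unsigned long nr_invalidate = (end - start) >> stride_shift;

	if (nr_invalidate > ceiling) {
		printf("full TLB flush (%lu entries > ceiling %lu)\n",
		       nr_invalidate, ceiling);
		return;
	}
	for (unsigned long addr = start; addr < end; addr += 1UL << stride_shift)
		printf("invlpg %#lx\n", addr);
}

int main(void)
{
	unsigned long ceiling = 33;	/* tlb_single_page_flush_ceiling */

	flush(0x7f0000000000UL, 0x7f0000000000UL + (8UL << PMD_SHIFT),
	      PMD_SHIFT, ceiling);	/* 8 huge pages: 8 invalidations */
	flush(0x7f0000000000UL, 0x7f0000000000UL + (8UL << PMD_SHIFT),
	      PAGE_SHIFT, ceiling);	/* same bytes in 4k strides: full flush */
	return 0;
}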
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index e3b18ad49889..145506f9fdbe 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -22,6 +22,7 @@
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/nmi.h> 23#include <linux/nmi.h>
24#include <linux/cpuhotplug.h> 24#include <linux/cpuhotplug.h>
25#include <linux/stackprotector.h>
25 26
26#include <asm/paravirt.h> 27#include <asm/paravirt.h>
27#include <asm/desc.h> 28#include <asm/desc.h>
@@ -88,6 +89,7 @@ static void cpu_bringup(void)
88asmlinkage __visible void cpu_bringup_and_idle(void) 89asmlinkage __visible void cpu_bringup_and_idle(void)
89{ 90{
90 cpu_bringup(); 91 cpu_bringup();
92 boot_init_stack_canary();
91 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); 93 cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
92} 94}
93 95
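On Xen PV the secondary-CPU entry point now calls boot_init_stack_canary() itself, since cpu_startup_entry() no longer does, and the call must happen before any stack-protected code runs on that CPU. The user-space analogue below only illustrates that ordering requirement; the guard variable, init_canary() and the manual epilogue check stand in for what the compiler's -fstack-protector instrumentation and the real per-CPU canary do.

#include <stdio.h>
#include <stdlib.h>

static unsigned long guard;

static void init_canary(void)
{
	/* Stand-in for the rdtsc/rdrand seeding done at boot. */
	srand(1234);
	guard = ((unsigned long)rand() << 16) ^ (unsigned long)rand();
}

static void protected_function(const char *arg)
{
	unsigned long canary = guard;	/* prologue: copy guard to frame */
	char buf[16];

	snprintf(buf, sizeof(buf), "%s", arg);
	printf("%s\n", buf);

	if (canary != guard) {		/* epilogue: detect corruption */
		fprintf(stderr, "stack smashing detected\n");
		abort();
	}
}

int main(void)
{
	init_canary();			/* must run before protected code */
	protected_function("hello");
	return 0;
}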
diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c
index 84b3e4445d46..3931c7de7c69 100644
--- a/drivers/iommu/amd_iommu_init.c
+++ b/drivers/iommu/amd_iommu_init.c
@@ -902,12 +902,22 @@ static bool copy_device_table(void)
902 } 902 }
903 } 903 }
904 904
905 old_devtb_phys = entry & PAGE_MASK; 905 /*
906 * When SME is enabled in the first kernel, the entry includes the
907 * memory encryption mask (sme_me_mask), so we must remove the
908 * mask to obtain the true physical address in the kdump kernel.
909 */
910 old_devtb_phys = __sme_clr(entry) & PAGE_MASK;
911
906 if (old_devtb_phys >= 0x100000000ULL) { 912 if (old_devtb_phys >= 0x100000000ULL) {
907 pr_err("The address of old device table is above 4G, not trustworthy!\n"); 913 pr_err("The address of old device table is above 4G, not trustworthy!\n");
908 return false; 914 return false;
909 } 915 }
910 old_devtb = memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB); 916 old_devtb = (sme_active() && is_kdump_kernel())
917 ? (__force void *)ioremap_encrypted(old_devtb_phys,
918 dev_table_size)
919 : memremap(old_devtb_phys, dev_table_size, MEMREMAP_WB);
920
911 if (!old_devtb) 921 if (!old_devtb)
912 return false; 922 return false;
913 923
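With SME active, a device-table pointer written by the first kernel still carries the encryption mask, so masking with PAGE_MASK alone can leave the C-bit set and make the address look like it lies above 4G. The sketch below shows the effect of clearing a hypothetical mask bit first; the real sme_me_mask position is discovered from CPUID at boot, and the bit-47 value and sme_clr() helper here are illustrative stand-ins.

#include <stdio.h>
#include <stdint.h>

#define PAGE_MASK	(~0xfffULL)
#define SME_ME_MASK	(1ULL << 47)	/* illustrative C-bit position */

static uint64_t sme_clr(uint64_t val)
{
	return val & ~SME_ME_MASK;
}

int main(void)
{
	/* A device-table entry as the old kernel wrote it with SME on:
	 * physical address plus the encryption bit plus low flag bits. */
	uint64_t entry = 0x00000000fd200000ULL | SME_ME_MASK | 0x3;

	printf("entry                        %#llx\n",
	       (unsigned long long)entry);
	printf("entry & PAGE_MASK            %#llx (C-bit still set)\n",
	       (unsigned long long)(entry & PAGE_MASK));
	printf("sme_clr(entry) & PAGE_MASK   %#llx\n",
	       (unsigned long long)(sme_clr(entry) & PAGE_MASK));
	return 0;
}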
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728f8ac6..91ae16fbd7d5 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -24,6 +24,8 @@
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/pagemap.h> 25#include <linux/pagemap.h>
26#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/mem_encrypt.h>
28#include <asm/pgtable.h>
27#include <asm/io.h> 29#include <asm/io.h>
28#include "internal.h" 30#include "internal.h"
29 31
@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
98 100
99/* Reads a page from the oldmem device from given offset. */ 101/* Reads a page from the oldmem device from given offset. */
100static ssize_t read_from_oldmem(char *buf, size_t count, 102static ssize_t read_from_oldmem(char *buf, size_t count,
101 u64 *ppos, int userbuf) 103 u64 *ppos, int userbuf,
104 bool encrypted)
102{ 105{
103 unsigned long pfn, offset; 106 unsigned long pfn, offset;
104 size_t nr_bytes; 107 size_t nr_bytes;
@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
120 if (pfn_is_ram(pfn) == 0) 123 if (pfn_is_ram(pfn) == 0)
121 memset(buf, 0, nr_bytes); 124 memset(buf, 0, nr_bytes);
122 else { 125 else {
123 tmp = copy_oldmem_page(pfn, buf, nr_bytes, 126 if (encrypted)
124 offset, userbuf); 127 tmp = copy_oldmem_page_encrypted(pfn, buf,
128 nr_bytes,
129 offset,
130 userbuf);
131 else
132 tmp = copy_oldmem_page(pfn, buf, nr_bytes,
133 offset, userbuf);
134
125 if (tmp < 0) 135 if (tmp < 0)
126 return tmp; 136 return tmp;
127 } 137 }
@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
155 */ 165 */
156ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) 166ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
157{ 167{
158 return read_from_oldmem(buf, count, ppos, 0); 168 return read_from_oldmem(buf, count, ppos, 0, false);
159} 169}
160 170
161/* 171/*
@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
163 */ 173 */
164ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) 174ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
165{ 175{
166 return read_from_oldmem(buf, count, ppos, 0); 176 return read_from_oldmem(buf, count, ppos, 0, sme_active());
167} 177}
168 178
169/* 179/*
@@ -173,10 +183,21 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
173 unsigned long from, unsigned long pfn, 183 unsigned long from, unsigned long pfn,
174 unsigned long size, pgprot_t prot) 184 unsigned long size, pgprot_t prot)
175{ 185{
186 prot = pgprot_encrypted(prot);
176 return remap_pfn_range(vma, from, pfn, size, prot); 187 return remap_pfn_range(vma, from, pfn, size, prot);
177} 188}
178 189
179/* 190/*
191 * Architectures which support memory encryption override this.
192 */
193ssize_t __weak
194copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
195 unsigned long offset, int userbuf)
196{
197 return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
198}
199
200/*
180 * Copy to either kernel or user space 201 * Copy to either kernel or user space
181 */ 202 */
182static int copy_to(void *target, void *src, size_t size, int userbuf) 203static int copy_to(void *target, void *src, size_t size, int userbuf)
@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
351 m->offset + m->size - *fpos, 372 m->offset + m->size - *fpos,
352 buflen); 373 buflen);
353 start = m->paddr + *fpos - m->offset; 374 start = m->paddr + *fpos - m->offset;
354 tmp = read_from_oldmem(buffer, tsz, &start, userbuf); 375 tmp = read_from_oldmem(buffer, tsz, &start,
376 userbuf, sme_active());
355 if (tmp < 0) 377 if (tmp < 0)
356 return tmp; 378 return tmp;
357 buflen -= tsz; 379 buflen -= tsz;
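read_from_oldmem() walks the requested window one page frame at a time and now forwards each chunk to either the plain or the encrypted copy helper, depending on the flag its callers pass (sme_active() for the dump payload, false for the ELF headers). A user-space sketch of that per-page split, with the copy calls reduced to printf() and the read_oldmem() helper name invented for the example:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

static void read_oldmem(unsigned long long pos, unsigned long count,
			int encrypted)
{
	while (count) {
		unsigned long pfn = pos >> PAGE_SHIFT;
		unsigned long offset = pos & (PAGE_SIZE - 1);
		unsigned long nr = PAGE_SIZE - offset;	/* stay in this page */

		if (nr > count)
			nr = count;
		printf("copy_oldmem_page%s(pfn=%lu, offset=%lu, len=%lu)\n",
		       encrypted ? "_encrypted" : "", pfn, offset, nr);
		pos += nr;
		count -= nr;
	}
}

int main(void)
{
	read_oldmem(0x1ff8, 0x20, 1);	/* crosses one page boundary */
	return 0;
}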
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 3e4ba9d753c8..f774c5eb9e3c 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -26,6 +26,10 @@ extern int remap_oldmem_pfn_range(struct vm_area_struct *vma,
26 26
27extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, 27extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
28 unsigned long, int); 28 unsigned long, int);
29extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf,
30 size_t csize, unsigned long offset,
31 int userbuf);
32
29void vmcore_cleanup(void); 33void vmcore_cleanup(void);
30 34
31/* Architecture code defines this if there are other possible ELF 35/* Architecture code defines this if there are other possible ELF
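The new copy_oldmem_page_encrypted() declaration is backed by a __weak default in fs/proc/vmcore.c that simply forwards to copy_oldmem_page(), so only architectures with memory encryption need to override it. A stand-alone illustration of that weak-override pattern follows; copy_plain() and copy_encrypted() are made-up stand-ins, and linking in a second file that defines a non-weak copy_encrypted() would replace the default at link time.

#include <stdio.h>

long copy_plain(unsigned long pfn)
{
	printf("plain copy of pfn %lu\n", pfn);
	return 0;
}

/* Default implementation; an architecture-specific strong definition
 * elsewhere would silently take precedence. */
__attribute__((weak)) long copy_encrypted(unsigned long pfn)
{
	return copy_plain(pfn);		/* no encryption support */
}

int main(void)
{
	copy_encrypted(42);
	return 0;
}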
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9fb239e12b82..a56f08ff3097 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -53,6 +53,10 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
53 smp_call_func_t func, void *info, bool wait, 53 smp_call_func_t func, void *info, bool wait,
54 gfp_t gfp_flags); 54 gfp_t gfp_flags);
55 55
56void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
57 smp_call_func_t func, void *info, bool wait,
58 gfp_t gfp_flags, const struct cpumask *mask);
59
56int smp_call_function_single_async(int cpu, call_single_data_t *csd); 60int smp_call_function_single_async(int cpu, call_single_data_t *csd);
57 61
58#ifdef CONFIG_SMP 62#ifdef CONFIG_SMP
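on_each_cpu_cond_mask() behaves like on_each_cpu_cond() but restricts the predicate scan to a caller-supplied cpumask; the TLB code uses it with a "CPU is not lazy" test to avoid IPIs to lazy CPUs when no page tables were freed. A simplified user-space model follows, with a plain bitmask instead of struct cpumask and the wait/gfp arguments dropped:

#include <stdio.h>

#define NR_CPUS 8

typedef int (*cond_fn)(int cpu, void *info);

static void on_each_cpu_cond_mask(cond_fn cond, void *info, unsigned int mask)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if ((mask & (1u << cpu)) && cond(cpu, info))
			printf("IPI -> CPU %d\n", cpu);	/* func(info) runs there */
}

static int tlb_is_not_lazy(int cpu, void *info)
{
	unsigned int lazy_mask = *(unsigned int *)info;

	return !(lazy_mask & (1u << cpu));
}

int main(void)
{
	unsigned int lazy = 0x0c;	/* CPUs 2 and 3 are in lazy TLB mode */

	on_each_cpu_cond_mask(tlb_is_not_lazy, &lazy, 0x0f);
	return 0;
}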
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 23a83a4da38a..86ef06d3dbe3 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -471,6 +471,10 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
471 } 471 }
472 } 472 }
473 473
474 /* Ensure that these pages are decrypted if SME is enabled. */
475 if (pages)
476 arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0);
477
474 return pages; 478 return pages;
475} 479}
476 480
@@ -867,6 +871,7 @@ static int kimage_load_crash_segment(struct kimage *image,
867 result = -ENOMEM; 871 result = -ENOMEM;
868 goto out; 872 goto out;
869 } 873 }
874 arch_kexec_post_alloc_pages(page_address(page), 1, 0);
870 ptr = kmap(page); 875 ptr = kmap(page);
871 ptr += maddr & ~PAGE_MASK; 876 ptr += maddr & ~PAGE_MASK;
872 mchunk = min_t(size_t, mbytes, 877 mchunk = min_t(size_t, mbytes,
@@ -884,6 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image,
884 result = copy_from_user(ptr, buf, uchunk); 889 result = copy_from_user(ptr, buf, uchunk);
885 kexec_flush_icache_page(page); 890 kexec_flush_icache_page(page);
886 kunmap(page); 891 kunmap(page);
892 arch_kexec_pre_free_pages(page_address(page), 1);
887 if (result) { 893 if (result) {
888 result = -EFAULT; 894 result = -EFAULT;
889 goto out; 895 goto out;
diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..b3a3a1fc499e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -318,33 +318,34 @@ int release_resource(struct resource *old)
318 318
319EXPORT_SYMBOL(release_resource); 319EXPORT_SYMBOL(release_resource);
320 320
321/* 321/**
322 * Finds the lowest iomem resource existing within [res->start.res->end). 322 * Finds the lowest iomem resource that covers part of [start..end]. The
323 * The caller must specify res->start, res->end, res->flags, and optionally 323 * caller must specify start, end, flags, and desc (which may be
324 * desc. If found, returns 0, res is overwritten, if not found, returns -1. 324 * IORES_DESC_NONE).
325 * This function walks the whole tree and not just first level children until 325 *
326 * and unless first_level_children_only is true. 326 * If a resource is found, returns 0 and *res is overwritten with the part
327 * of the resource that's within [start..end]; if none is found, returns
328 * -1.
329 *
330 * This function walks the whole tree and not just first level children
331 * unless @first_lvl is true.
327 */ 332 */
328static int find_next_iomem_res(struct resource *res, unsigned long desc, 333static int find_next_iomem_res(resource_size_t start, resource_size_t end,
329 bool first_level_children_only) 334 unsigned long flags, unsigned long desc,
335 bool first_lvl, struct resource *res)
330{ 336{
331 resource_size_t start, end;
332 struct resource *p; 337 struct resource *p;
333 bool sibling_only = false;
334 338
335 BUG_ON(!res); 339 if (!res)
336 340 return -EINVAL;
337 start = res->start;
338 end = res->end;
339 BUG_ON(start >= end);
340 341
341 if (first_level_children_only) 342 if (start >= end)
342 sibling_only = true; 343 return -EINVAL;
343 344
344 read_lock(&resource_lock); 345 read_lock(&resource_lock);
345 346
346 for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { 347 for (p = iomem_resource.child; p; p = next_resource(p, first_lvl)) {
347 if ((p->flags & res->flags) != res->flags) 348 if ((p->flags & flags) != flags)
348 continue; 349 continue;
349 if ((desc != IORES_DESC_NONE) && (desc != p->desc)) 350 if ((desc != IORES_DESC_NONE) && (desc != p->desc))
350 continue; 351 continue;
@@ -352,45 +353,43 @@ static int find_next_iomem_res(struct resource *res, unsigned long desc,
352 p = NULL; 353 p = NULL;
353 break; 354 break;
354 } 355 }
355 if ((p->end >= start) && (p->start < end)) 356 if ((p->end >= start) && (p->start <= end))
356 break; 357 break;
357 } 358 }
358 359
359 read_unlock(&resource_lock); 360 read_unlock(&resource_lock);
360 if (!p) 361 if (!p)
361 return -1; 362 return -1;
363
362 /* copy data */ 364 /* copy data */
363 if (res->start < p->start) 365 res->start = max(start, p->start);
364 res->start = p->start; 366 res->end = min(end, p->end);
365 if (res->end > p->end)
366 res->end = p->end;
367 res->flags = p->flags; 367 res->flags = p->flags;
368 res->desc = p->desc; 368 res->desc = p->desc;
369 return 0; 369 return 0;
370} 370}
371 371
372static int __walk_iomem_res_desc(struct resource *res, unsigned long desc, 372static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end,
373 bool first_level_children_only, 373 unsigned long flags, unsigned long desc,
374 void *arg, 374 bool first_lvl, void *arg,
375 int (*func)(struct resource *, void *)) 375 int (*func)(struct resource *, void *))
376{ 376{
377 u64 orig_end = res->end; 377 struct resource res;
378 int ret = -1; 378 int ret = -1;
379 379
380 while ((res->start < res->end) && 380 while (start < end &&
381 !find_next_iomem_res(res, desc, first_level_children_only)) { 381 !find_next_iomem_res(start, end, flags, desc, first_lvl, &res)) {
382 ret = (*func)(res, arg); 382 ret = (*func)(&res, arg);
383 if (ret) 383 if (ret)
384 break; 384 break;
385 385
386 res->start = res->end + 1; 386 start = res.end + 1;
387 res->end = orig_end;
388 } 387 }
389 388
390 return ret; 389 return ret;
391} 390}
392 391
393/* 392/**
394 * Walks through iomem resources and calls func() with matching resource 393 * Walks through iomem resources and calls func() with matching resource
395 * ranges. This walks through whole tree and not just first level children. 394 * ranges. This walks through whole tree and not just first level children.
396 * All the memory ranges which overlap start,end and also match flags and 395 * All the memory ranges which overlap start,end and also match flags and
@@ -407,13 +406,7 @@ static int __walk_iomem_res_desc(struct resource *res, unsigned long desc,
407int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, 406int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
408 u64 end, void *arg, int (*func)(struct resource *, void *)) 407 u64 end, void *arg, int (*func)(struct resource *, void *))
409{ 408{
410 struct resource res; 409 return __walk_iomem_res_desc(start, end, flags, desc, false, arg, func);
411
412 res.start = start;
413 res.end = end;
414 res.flags = flags;
415
416 return __walk_iomem_res_desc(&res, desc, false, arg, func);
417} 410}
418EXPORT_SYMBOL_GPL(walk_iomem_res_desc); 411EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
419 412
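For reference, a minimal sketch of how the exported walker is typically driven. The callback and the byte-counting wrapper below are illustrative, not taken from this series; note that the end argument is inclusive:

    /* Sum the bytes of persistent memory that intersect [start..end]. */
    static int count_bytes(struct resource *res, void *arg)
    {
            u64 *total = arg;

            *total += resource_size(res);
            return 0;               /* a non-zero return stops the walk */
    }

    static u64 pmem_bytes_in_range(u64 start, u64 end)
    {
            u64 total = 0;

            walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY,
                                IORESOURCE_MEM | IORESOURCE_BUSY,
                                start, end, &total, count_bytes);
            return total;
    }
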
@@ -425,15 +418,11 @@ EXPORT_SYMBOL_GPL(walk_iomem_res_desc);
425 * ranges. 418 * ranges.
426 */ 419 */
427int walk_system_ram_res(u64 start, u64 end, void *arg, 420int walk_system_ram_res(u64 start, u64 end, void *arg,
428 int (*func)(struct resource *, void *)) 421 int (*func)(struct resource *, void *))
429{ 422{
430 struct resource res; 423 unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
431 424
432 res.start = start; 425 return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
433 res.end = end;
434 res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
435
436 return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true,
437 arg, func); 426 arg, func);
438} 427}
439 428
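A hedged sketch of a walk_system_ram_res() caller; the callback is hypothetical, but kexec-style users follow this shape, being handed each busy System RAM range that intersects the (inclusive) window:

    /* Print every busy System RAM range in the whole address space. */
    static int note_ram_range(struct resource *res, void *arg)
    {
            pr_info("System RAM: %pR\n", res);
            return 0;
    }

    static void dump_system_ram(void)
    {
            walk_system_ram_res(0, -1ULL, NULL, note_ram_range);
    }

walk_mem_res() below is the same pattern with IORESOURCE_MEM | IORESOURCE_BUSY, and both wrappers pass first_lvl = true, so only first-level children of iomem_resource are visited.
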
@@ -444,13 +433,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
444int walk_mem_res(u64 start, u64 end, void *arg, 433int walk_mem_res(u64 start, u64 end, void *arg,
445 int (*func)(struct resource *, void *)) 434 int (*func)(struct resource *, void *))
446{ 435{
447 struct resource res; 436 unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
448
449 res.start = start;
450 res.end = end;
451 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
452 437
453 return __walk_iomem_res_desc(&res, IORES_DESC_NONE, true, 438 return __walk_iomem_res_desc(start, end, flags, IORES_DESC_NONE, true,
454 arg, func); 439 arg, func);
455} 440}
456 441
@@ -462,27 +447,27 @@ int walk_mem_res(u64 start, u64 end, void *arg,
462 * It is to be used only for System RAM. 447 * It is to be used only for System RAM.
463 */ 448 */
464int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, 449int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
465 void *arg, int (*func)(unsigned long, unsigned long, void *)) 450 void *arg, int (*func)(unsigned long, unsigned long, void *))
466{ 451{
452 resource_size_t start, end;
453 unsigned long flags;
467 struct resource res; 454 struct resource res;
468 unsigned long pfn, end_pfn; 455 unsigned long pfn, end_pfn;
469 u64 orig_end;
470 int ret = -1; 456 int ret = -1;
471 457
472 res.start = (u64) start_pfn << PAGE_SHIFT; 458 start = (u64) start_pfn << PAGE_SHIFT;
473 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 459 end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
474 res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; 460 flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
475 orig_end = res.end; 461 while (start < end &&
476 while ((res.start < res.end) && 462 !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
477 (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { 463 true, &res)) {
478 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 464 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
479 end_pfn = (res.end + 1) >> PAGE_SHIFT; 465 end_pfn = (res.end + 1) >> PAGE_SHIFT;
480 if (end_pfn > pfn) 466 if (end_pfn > pfn)
481 ret = (*func)(pfn, end_pfn - pfn, arg); 467 ret = (*func)(pfn, end_pfn - pfn, arg);
482 if (ret) 468 if (ret)
483 break; 469 break;
484 res.start = res.end + 1; 470 start = res.end + 1;
485 res.end = orig_end;
486 } 471 }
487 return ret; 472 return ret;
488} 473}
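The pfn-based variant hands the callback page-frame numbers rather than a struct resource. An illustrative caller (both names are hypothetical):

    /* Count the System RAM pages inside a pfn window. */
    static int count_ram_pages(unsigned long start_pfn, unsigned long nr_pages,
                               void *arg)
    {
            unsigned long *pages = arg;

            *pages += nr_pages;
            return 0;
    }

    static unsigned long ram_pages(unsigned long start_pfn, unsigned long nr_pages)
    {
            unsigned long pages = 0;

            walk_system_ram_range(start_pfn, nr_pages, &pages, count_ram_pages);
            return pages;
    }
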
@@ -658,8 +643,8 @@ static int find_resource(struct resource *root, struct resource *new,
658 * @constraint: the size and alignment constraints to be met. 643 * @constraint: the size and alignment constraints to be met.
659 */ 644 */
660static int reallocate_resource(struct resource *root, struct resource *old, 645static int reallocate_resource(struct resource *root, struct resource *old,
661 resource_size_t newsize, 646 resource_size_t newsize,
662 struct resource_constraint *constraint) 647 struct resource_constraint *constraint)
663{ 648{
664 int err=0; 649 int err=0;
665 struct resource new = *old; 650 struct resource new = *old;
@@ -972,7 +957,7 @@ skip:
972 * Existing children of the resource are assumed to be immutable. 957 * Existing children of the resource are assumed to be immutable.
973 */ 958 */
974int adjust_resource(struct resource *res, resource_size_t start, 959int adjust_resource(struct resource *res, resource_size_t start,
975 resource_size_t size) 960 resource_size_t size)
976{ 961{
977 int result; 962 int result;
978 963
@@ -983,9 +968,9 @@ int adjust_resource(struct resource *res, resource_size_t start,
983} 968}
984EXPORT_SYMBOL(adjust_resource); 969EXPORT_SYMBOL(adjust_resource);
985 970
986static void __init __reserve_region_with_split(struct resource *root, 971static void __init
987 resource_size_t start, resource_size_t end, 972__reserve_region_with_split(struct resource *root, resource_size_t start,
988 const char *name) 973 resource_size_t end, const char *name)
989{ 974{
990 struct resource *parent = root; 975 struct resource *parent = root;
991 struct resource *conflict; 976 struct resource *conflict;
@@ -1044,9 +1029,9 @@ static void __init __reserve_region_with_split(struct resource *root,
1044 1029
1045} 1030}
1046 1031
1047void __init reserve_region_with_split(struct resource *root, 1032void __init
1048 resource_size_t start, resource_size_t end, 1033reserve_region_with_split(struct resource *root, resource_size_t start,
1049 const char *name) 1034 resource_size_t end, const char *name)
1050{ 1035{
1051 int abort = 0; 1036 int abort = 0;
1052 1037
@@ -1172,7 +1157,7 @@ EXPORT_SYMBOL(__request_region);
1172 * The described resource region must match a currently busy region. 1157 * The described resource region must match a currently busy region.
1173 */ 1158 */
1174void __release_region(struct resource *parent, resource_size_t start, 1159void __release_region(struct resource *parent, resource_size_t start,
1175 resource_size_t n) 1160 resource_size_t n)
1176{ 1161{
1177 struct resource **p; 1162 struct resource **p;
1178 resource_size_t end; 1163 resource_size_t end;
@@ -1234,7 +1219,7 @@ EXPORT_SYMBOL(__release_region);
1234 * simplicity. Enhance this logic when necessary. 1219 * simplicity. Enhance this logic when necessary.
1235 */ 1220 */
1236int release_mem_region_adjustable(struct resource *parent, 1221int release_mem_region_adjustable(struct resource *parent,
1237 resource_size_t start, resource_size_t size) 1222 resource_size_t start, resource_size_t size)
1238{ 1223{
1239 struct resource **p; 1224 struct resource **p;
1240 struct resource *res; 1225 struct resource *res;
@@ -1410,9 +1395,9 @@ static int devm_region_match(struct device *dev, void *res, void *match_data)
1410 this->start == match->start && this->n == match->n; 1395 this->start == match->start && this->n == match->n;
1411} 1396}
1412 1397
1413struct resource * __devm_request_region(struct device *dev, 1398struct resource *
1414 struct resource *parent, resource_size_t start, 1399__devm_request_region(struct device *dev, struct resource *parent,
1415 resource_size_t n, const char *name) 1400 resource_size_t start, resource_size_t n, const char *name)
1416{ 1401{
1417 struct region_devres *dr = NULL; 1402 struct region_devres *dr = NULL;
1418 struct resource *res; 1403 struct resource *res;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 16f84142f2f4..f5516bae0c1b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -347,21 +347,6 @@ EXPORT_SYMBOL_GPL(play_idle);
347 347
348void cpu_startup_entry(enum cpuhp_state state) 348void cpu_startup_entry(enum cpuhp_state state)
349{ 349{
350 /*
351 * This #ifdef needs to die, but it's too late in the cycle to
352 * make this generic (ARM and SH have never invoked the canary
353 * init for the non boot CPUs!). Will be fixed in 3.11
354 */
355#ifdef CONFIG_X86
356 /*
357 * If we're the non-boot CPU, nothing set the stack canary up
358 * for us. The boot CPU already has it initialized but no harm
359 * in doing it again. This is a good place for updating it, as
360 * we wont ever return from this function (so the invalid
361 * canaries already on the stack wont ever trigger).
362 */
363 boot_init_stack_canary();
364#endif
365 arch_cpu_idle_prepare(); 350 arch_cpu_idle_prepare();
366 cpuhp_online_idle(state); 351 cpuhp_online_idle(state);
367 while (1) 352 while (1)
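The canary setup removed here still has to happen before a secondary CPU enters its idle loop; presumably the arch-side CPU bring-up code now does it itself. A sketch under that assumption, with start_secondary() standing in for the x86 entry point (not a hunk shown on this page):

    /* Sketch only: x86 secondary-CPU entry initializing its own canary. */
    static void notrace start_secondary(void *unused)
    {
            /* ... existing per-CPU bring-up ... */

            /*
             * Set up the stack canary here instead of in the generic
             * cpu_startup_entry(); this function never returns, so stale
             * canaries already on the stack can never trip the check.
             */
            boot_init_stack_canary();

            cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
    }
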
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc24f2b8c646..b8c007713b3b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -56,7 +56,6 @@
56#include <linux/profile.h> 56#include <linux/profile.h>
57#include <linux/rcupdate_wait.h> 57#include <linux/rcupdate_wait.h>
58#include <linux/security.h> 58#include <linux/security.h>
59#include <linux/stackprotector.h>
60#include <linux/stop_machine.h> 59#include <linux/stop_machine.h>
61#include <linux/suspend.h> 60#include <linux/suspend.h>
62#include <linux/swait.h> 61#include <linux/swait.h>
diff --git a/kernel/smp.c b/kernel/smp.c
index d86eec5f51c1..163c451af42e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,9 +669,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
669 * You must not call this function with disabled interrupts or 669 * You must not call this function with disabled interrupts or
670 * from a hardware interrupt handler or from a bottom half handler. 670 * from a hardware interrupt handler or from a bottom half handler.
671 */ 671 */
672void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), 672void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
673 smp_call_func_t func, void *info, bool wait, 673 smp_call_func_t func, void *info, bool wait,
674 gfp_t gfp_flags) 674 gfp_t gfp_flags, const struct cpumask *mask)
675{ 675{
676 cpumask_var_t cpus; 676 cpumask_var_t cpus;
677 int cpu, ret; 677 int cpu, ret;
@@ -680,9 +680,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
680 680
681 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { 681 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
682 preempt_disable(); 682 preempt_disable();
683 for_each_online_cpu(cpu) 683 for_each_cpu(cpu, mask)
684 if (cond_func(cpu, info)) 684 if (cond_func(cpu, info))
685 cpumask_set_cpu(cpu, cpus); 685 __cpumask_set_cpu(cpu, cpus);
686 on_each_cpu_mask(cpus, func, info, wait); 686 on_each_cpu_mask(cpus, func, info, wait);
687 preempt_enable(); 687 preempt_enable();
688 free_cpumask_var(cpus); 688 free_cpumask_var(cpus);
@@ -692,7 +692,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
692 * just have to IPI them one by one. 692 * just have to IPI them one by one.
693 */ 693 */
694 preempt_disable(); 694 preempt_disable();
695 for_each_online_cpu(cpu) 695 for_each_cpu(cpu, mask)
696 if (cond_func(cpu, info)) { 696 if (cond_func(cpu, info)) {
697 ret = smp_call_function_single(cpu, func, 697 ret = smp_call_function_single(cpu, func,
698 info, wait); 698 info, wait);
@@ -701,6 +701,15 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
701 preempt_enable(); 701 preempt_enable();
702 } 702 }
703} 703}
704EXPORT_SYMBOL(on_each_cpu_cond_mask);
705
706void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
707 smp_call_func_t func, void *info, bool wait,
708 gfp_t gfp_flags)
709{
710 on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags,
711 cpu_online_mask);
712}
704EXPORT_SYMBOL(on_each_cpu_cond); 713EXPORT_SYMBOL(on_each_cpu_cond);
705 714
706static void do_nothing(void *unused) 715static void do_nothing(void *unused)
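A minimal sketch of a caller of the new on_each_cpu_cond_mask() variant; the per-CPU flag and both callbacks are made up for illustration. The extra mask argument lets a caller restrict the conditional IPIs to a subset of CPUs (for example, a mm's cpumask) instead of every online CPU:

    static DEFINE_PER_CPU(int, pending_work);       /* hypothetical state */

    static bool cpu_has_work(int cpu, void *info)
    {
            return per_cpu(pending_work, cpu) != 0;
    }

    static void drain_work(void *info)
    {
            this_cpu_write(pending_work, 0);         /* runs in IPI context */
    }

    static void kick_busy_cpus(void)
    {
            /*
             * Only CPUs in the mask for which cpu_has_work() returns true
             * receive the IPI; wait = true blocks until drain_work() has
             * finished on all of them.
             */
            on_each_cpu_cond_mask(cpu_has_work, drain_work, NULL, true,
                                  GFP_KERNEL, cpu_online_mask);
    }

The old on_each_cpu_cond() keeps its behaviour by forwarding cpu_online_mask here, and NULL in the UP build below, where only CPU 0 exists.
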
diff --git a/kernel/up.c b/kernel/up.c
index 42c46bf3e0a5..ff536f9cc8a2 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -68,9 +68,9 @@ EXPORT_SYMBOL(on_each_cpu_mask);
68 * Preemption is disabled here to make sure the cond_func is called under the 68 * Preemption is disabled here to make sure the cond_func is called under the
69 * same condtions in UP and SMP. 69 * same condtions in UP and SMP.
70 */ 70 */
71void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), 71void on_each_cpu_cond_mask(bool (*cond_func)(int cpu, void *info),
72 smp_call_func_t func, void *info, bool wait, 72 smp_call_func_t func, void *info, bool wait,
73 gfp_t gfp_flags) 73 gfp_t gfp_flags, const struct cpumask *mask)
74{ 74{
75 unsigned long flags; 75 unsigned long flags;
76 76
@@ -82,6 +82,14 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
82 } 82 }
83 preempt_enable(); 83 preempt_enable();
84} 84}
85EXPORT_SYMBOL(on_each_cpu_cond_mask);
86
87void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
88 smp_call_func_t func, void *info, bool wait,
89 gfp_t gfp_flags)
90{
91 on_each_cpu_cond_mask(cond_func, func, info, wait, gfp_flags, NULL);
92}
85EXPORT_SYMBOL(on_each_cpu_cond); 93EXPORT_SYMBOL(on_each_cpu_cond);
86 94
87int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) 95int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cf2af04b34b9..532c29276fce 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/hugetlb.h>
11#include <asm/tlb.h> 12#include <asm/tlb.h>
12#include <asm-generic/pgtable.h> 13#include <asm-generic/pgtable.h>
13 14