author     Linus Torvalds <torvalds@linux-foundation.org>   2016-03-15 13:45:39 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-03-15 13:45:39 -0400
commit     13c76ad87216513db2487aac84155aa57dfd46ce (patch)
tree       265661a60dd960bc01e74a65367edd3161b1e018
parent     9cf8d6360c1589a97a98313729ed9e5db187f80b (diff)
parent     8b8addf891de8a00e4d39fc32f93f7c5eb8feceb (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Enable full ASLR randomization for 32-bit programs (Hector
     Marco-Gisbert)

   - Add initial minimal INVPCID support, to flush global mappings (Andy
     Lutomirski)

   - Add KASAN enhancements (Andrey Ryabinin)

   - Fix mmiotrace for huge pages (Karol Herbst)

   - ... misc cleanups and small enhancements"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm/32: Enable full randomization on i386 and X86_32
  x86/mm/kmmio: Fix mmiotrace for hugepages
  x86/mm: Avoid premature success when changing page attributes
  x86/mm/ptdump: Remove paravirt_enabled()
  x86/mm: Fix INVPCID asm constraint
  x86/dmi: Switch dmi_remap() from ioremap() [uncached] to ioremap_cache()
  x86/mm: If INVPCID is available, use it to flush global mappings
  x86/mm: Add a 'noinvpcid' boot option to turn off INVPCID
  x86/mm: Add INVPCID helpers
  x86/kasan: Write protect kasan zero shadow
  x86/kasan: Clear kasan_zero_page after TLB flush
  x86/mm/numa: Check for failures in numa_clear_kernel_node_hotplug()
  x86/mm/numa: Clean up numa_clear_kernel_node_hotplug()
  x86/mm: Make kmap_prot into a #define
  x86/mm/32: Set NX in __supported_pte_mask before enabling paging
  x86/mm: Streamline and restore probe_memory_block_size()
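[ Illustration, not part of the commit: a minimal user-space C sketch of the feature
  check and descriptor layout that the new INVPCID helpers in the tlbflush.h hunk
  below rely on. The CPUID.(EAX=7, ECX=0):EBX bit-10 test and the 16-byte
  {PCID, address} descriptor follow the Intel SDM; in the kernel itself the fast
  path is gated by static_cpu_has(X86_FEATURE_INVPCID) and invpcid_flush_all() is
  called from __native_flush_tlb_global(), as shown in the diff. This sketch only
  detects the feature (INVPCID is a privileged instruction) and assumes GCC/Clang's
  x86 <cpuid.h> header. ]

/*
 * Illustrative only (not kernel code): detect INVPCID the way the
 * X86_FEATURE_INVPCID / 'noinvpcid' logic in this merge keys off the CPU,
 * and show the descriptor layout that __invpcid() feeds to the instruction.
 */
#include <stdint.h>
#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang x86 builtin header */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=7, ECX=0):EBX bit 10 enumerates INVPCID support. */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) ||
	    !(ebx & (1u << 10))) {
		puts("INVPCID not available; the kernel falls back to CR4 toggling");
		return 0;
	}

	/* Descriptor: d[0] holds the PCID (low 12 bits), d[1] the address. */
	struct { uint64_t d[2]; } desc = { { 0, 0 } };

	/* Type 2 ("all incl. globals") is what invpcid_flush_all() uses. */
	printf("INVPCID available; desc = { %llu, %llu }, type 2 flushes everything\n",
	       (unsigned long long)desc.d[0], (unsigned long long)desc.d[1]);
	return 0;
}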
-rw-r--r--  Documentation/kernel-parameters.txt |  2
-rw-r--r--  arch/x86/include/asm/dmi.h          |  2
-rw-r--r--  arch/x86/include/asm/fixmap.h       |  2
-rw-r--r--  arch/x86/include/asm/tlbflush.h     | 57
-rw-r--r--  arch/x86/kernel/cpu/common.c        | 16
-rw-r--r--  arch/x86/kernel/head_32.S           |  6
-rw-r--r--  arch/x86/mm/dump_pagetables.c       | 11
-rw-r--r--  arch/x86/mm/init_32.c               |  3
-rw-r--r--  arch/x86/mm/init_64.c               | 24
-rw-r--r--  arch/x86/mm/kasan_init_64.c         | 17
-rw-r--r--  arch/x86/mm/kmmio.c                 | 88
-rw-r--r--  arch/x86/mm/mmap.c                  | 14
-rw-r--r--  arch/x86/mm/numa.c                  | 67
-rw-r--r--  arch/x86/mm/pageattr.c              |  4
-rw-r--r--  arch/x86/mm/setup_nx.c              |  5

15 files changed, 217 insertions(+), 101 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 084775f7b052..4324f2437e6a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2566,6 +2566,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 	nointroute	[IA-64]
 
+	noinvpcid	[X86] Disable the INVPCID cpu feature.
+
 	nojitter	[IA-64] Disables jitter checking for ITC timers.
 
 	no-kvmclock	[X86,KVM] Disable paravirtualized KVM clock driver
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
index 535192f6bfad..3c69fed215c5 100644
--- a/arch/x86/include/asm/dmi.h
+++ b/arch/x86/include/asm/dmi.h
@@ -15,7 +15,7 @@ static __always_inline __init void *dmi_alloc(unsigned len)
 /* Use early IO mappings for DMI because it's initialized early */
 #define dmi_early_remap		early_ioremap
 #define dmi_early_unmap		early_iounmap
-#define dmi_remap		ioremap
+#define dmi_remap		ioremap_cache
 #define dmi_unmap		iounmap
 
 #endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 6d7d0e52ed5a..8554f960e21b 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -138,7 +138,7 @@ extern void reserve_top_address(unsigned long reserve);
 extern int fixmaps_set;
 
 extern pte_t *kmap_pte;
-extern pgprot_t kmap_prot;
+#define kmap_prot PAGE_KERNEL
 extern pte_t *pkmap_page_table;
 
 void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 0bb31cb8c73b..c24b4224d439 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -8,6 +8,54 @@
 #include <asm/cpufeature.h>
 #include <asm/special_insns.h>
 
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+			     unsigned long type)
+{
+	struct { u64 d[2]; } desc = { { pcid, addr } };
+
+	/*
+	 * The memory clobber is because the whole point is to invalidate
+	 * stale TLB entries and, especially if we're flushing global
+	 * mappings, we don't want the compiler to reorder any subsequent
+	 * memory accesses before the TLB flush.
+	 *
+	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+	 * invpcid (%rcx), %rax in long mode.
+	 */
+	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+		      : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR		0
+#define INVPCID_TYPE_SINGLE_CTXT	1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL	2
+#define INVPCID_TYPE_ALL_NON_GLOBAL	3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+				     unsigned long addr)
+{
+	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -105,6 +153,15 @@ static inline void __native_flush_tlb_global(void)
 {
 	unsigned long flags;
 
+	if (static_cpu_has(X86_FEATURE_INVPCID)) {
+		/*
+		 * Using INVPCID is considerably faster than a pair of writes
+		 * to CR4 sandwiched inside an IRQ flag save/restore.
+		 */
+		invpcid_flush_all();
+		return;
+	}
+
 	/*
 	 * Read-modify-write to CR4 - protect it from preemption and
 	 * from interrupts. (Use the raw variant because this code can
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e8d25d395ee..249461f95851 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s)
 }
 __setup("nompx", x86_mpx_setup);
 
+static int __init x86_noinvpcid_setup(char *s)
+{
+	/* noinvpcid doesn't accept parameters */
+	if (s)
+		return -EINVAL;
+
+	/* do not emit a message if the feature is not present */
+	if (!boot_cpu_has(X86_FEATURE_INVPCID))
+		return 0;
+
+	setup_clear_cpu_cap(X86_FEATURE_INVPCID);
+	pr_info("noinvpcid: INVPCID feature disabled\n");
+	return 0;
+}
+early_param("noinvpcid", x86_noinvpcid_setup);
+
 #ifdef CONFIG_X86_32
 static int cachesize_override = -1;
 static int disable_x86_serial_nr = 1;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index af1112980dd4..54cdbd2003fe 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -389,6 +389,12 @@ default_entry:
 	/* Make changes effective */
 	wrmsr
 
+	/*
+	 * And make sure that all the mappings we set up have NX set from
+	 * the beginning.
+	 */
+	orl $(1 << (_PAGE_BIT_NX - 32)), pa(__supported_pte_mask + 4)
+
 enable_paging:
 
 /*
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 4a6f1d9b5106..99bfb192803f 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -358,20 +358,19 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
 #define pgd_none(a)  pud_none(__pud(pgd_val(a)))
 #endif
 
-#ifdef CONFIG_X86_64
 static inline bool is_hypervisor_range(int idx)
 {
+#ifdef CONFIG_X86_64
 	/*
 	 * ffff800000000000 - ffff87ffffffffff is reserved for
 	 * the hypervisor.
 	 */
-	return paravirt_enabled() &&
-		(idx >= pgd_index(__PAGE_OFFSET) - 16) &&
-		(idx < pgd_index(__PAGE_OFFSET));
-}
+	return	(idx >= pgd_index(__PAGE_OFFSET) - 16) &&
+		(idx < pgd_index(__PAGE_OFFSET));
 #else
-static inline bool is_hypervisor_range(int idx) { return false; }
+	return false;
 #endif
+}
 
 static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
 				       bool checkwx)
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2ebfbaf61142..bd7a9b9e2e14 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -388,7 +388,6 @@ repeat:
 }
 
 pte_t *kmap_pte;
-pgprot_t kmap_prot;
 
 static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
 {
@@ -405,8 +404,6 @@ static void __init kmap_init(void)
 	 */
 	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
 	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
-
-	kmap_prot = PAGE_KERNEL;
 }
 
 #ifdef CONFIG_HIGHMEM
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a40b755c67e3..214afda97911 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -53,6 +53,7 @@
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
 #include <asm/init.h>
+#include <asm/uv/uv.h>
 #include <asm/setup.h>
 
 #include "mm_internal.h"
@@ -1203,26 +1204,13 @@ int kern_addr_valid(unsigned long addr)
 
 static unsigned long probe_memory_block_size(void)
 {
-	/* start from 2g */
-	unsigned long bz = 1UL<<31;
+	unsigned long bz = MIN_MEMORY_BLOCK_SIZE;
 
-	if (totalram_pages >= (64ULL << (30 - PAGE_SHIFT))) {
-		pr_info("Using 2GB memory block size for large-memory system\n");
-		return 2UL * 1024 * 1024 * 1024;
-	}
-
-	/* less than 64g installed */
-	if ((max_pfn << PAGE_SHIFT) < (16UL << 32))
-		return MIN_MEMORY_BLOCK_SIZE;
-
-	/* get the tail size */
-	while (bz > MIN_MEMORY_BLOCK_SIZE) {
-		if (!((max_pfn << PAGE_SHIFT) & (bz - 1)))
-			break;
-		bz >>= 1;
-	}
+	/* if system is UV or has 64GB of RAM or more, use large blocks */
+	if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30)))
+		bz = 2UL << 30; /* 2GB */
 
-	printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20);
+	pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
 
 	return bz;
 }
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index d470cf219a2d..1b1110fa0057 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -120,11 +120,22 @@ void __init kasan_init(void)
 	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
 			(void *)KASAN_SHADOW_END);
 
-	memset(kasan_zero_page, 0, PAGE_SIZE);
-
 	load_cr3(init_level4_pgt);
 	__flush_tlb_all();
-	init_task.kasan_depth = 0;
 
+	/*
+	 * kasan_zero_page has been used as early shadow memory, thus it may
+	 * contain some garbage. Now we can clear and write protect it, since
+	 * after the TLB flush no one should write to it.
+	 */
+	memset(kasan_zero_page, 0, PAGE_SIZE);
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO);
+		set_pte(&kasan_zero_pte[i], pte);
+	}
+	/* Flush TLBs again to be sure that write protection applied. */
+	__flush_tlb_all();
+
+	init_task.kasan_depth = 0;
 	pr_info("KernelAddressSanitizer initialized\n");
 }
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 637ab34ed632..ddb2244b06a1 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -33,7 +33,7 @@
 struct kmmio_fault_page {
 	struct list_head list;
 	struct kmmio_fault_page *release_next;
-	unsigned long page; /* location of the fault page */
+	unsigned long addr; /* the requested address */
 	pteval_t old_presence; /* page presence prior to arming */
 	bool armed;
 
@@ -70,9 +70,16 @@ unsigned int kmmio_count;
 static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
 static LIST_HEAD(kmmio_probes);
 
-static struct list_head *kmmio_page_list(unsigned long page)
+static struct list_head *kmmio_page_list(unsigned long addr)
 {
-	return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)];
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+
+	if (!pte)
+		return NULL;
+	addr &= page_level_mask(l);
+
+	return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
 }
 
 /* Accessed per-cpu */
@@ -98,15 +105,19 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 }
 
 /* You must be holding RCU read lock. */
-static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
+static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
 {
 	struct list_head *head;
 	struct kmmio_fault_page *f;
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
 
-	page &= PAGE_MASK;
-	head = kmmio_page_list(page);
+	if (!pte)
+		return NULL;
+	addr &= page_level_mask(l);
+	head = kmmio_page_list(addr);
 	list_for_each_entry_rcu(f, head, list) {
-		if (f->page == page)
+		if (f->addr == addr)
 			return f;
 	}
 	return NULL;
@@ -137,10 +148,10 @@ static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
 static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 {
 	unsigned int level;
-	pte_t *pte = lookup_address(f->page, &level);
+	pte_t *pte = lookup_address(f->addr, &level);
 
 	if (!pte) {
-		pr_err("no pte for page 0x%08lx\n", f->page);
+		pr_err("no pte for addr 0x%08lx\n", f->addr);
 		return -1;
 	}
 
@@ -156,7 +167,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 		return -1;
 	}
 
-	__flush_tlb_one(f->page);
+	__flush_tlb_one(f->addr);
 	return 0;
 }
 
@@ -176,12 +187,12 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 	int ret;
 	WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
 	if (f->armed) {
-		pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
-			   f->page, f->count, !!f->old_presence);
+		pr_warning("double-arm: addr 0x%08lx, ref %d, old %d\n",
+			   f->addr, f->count, !!f->old_presence);
 	}
 	ret = clear_page_presence(f, true);
-	WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
-		  f->page);
+	WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
+		  f->addr);
 	f->armed = true;
 	return ret;
 }
@@ -191,7 +202,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
 	int ret = clear_page_presence(f, false);
 	WARN_ONCE(ret < 0,
-			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+			KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
 	f->armed = false;
 }
 
@@ -215,6 +226,12 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	struct kmmio_context *ctx;
 	struct kmmio_fault_page *faultpage;
 	int ret = 0; /* default to fault not handled */
+	unsigned long page_base = addr;
+	unsigned int l;
+	pte_t *pte = lookup_address(addr, &l);
+	if (!pte)
+		return -EINVAL;
+	page_base &= page_level_mask(l);
 
 	/*
 	 * Preemption is now disabled to prevent process switch during
@@ -227,7 +244,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	preempt_disable();
 	rcu_read_lock();
 
-	faultpage = get_kmmio_fault_page(addr);
+	faultpage = get_kmmio_fault_page(page_base);
 	if (!faultpage) {
 		/*
 		 * Either this page fault is not caused by kmmio, or
@@ -239,7 +256,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 
 	ctx = &get_cpu_var(kmmio_ctx);
 	if (ctx->active) {
-		if (addr == ctx->addr) {
+		if (page_base == ctx->addr) {
 			/*
 			 * A second fault on the same page means some other
 			 * condition needs handling by do_page_fault(), the
@@ -267,9 +284,9 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	ctx->active++;
 
 	ctx->fpage = faultpage;
-	ctx->probe = get_kmmio_probe(addr);
+	ctx->probe = get_kmmio_probe(page_base);
 	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
-	ctx->addr = addr;
+	ctx->addr = page_base;
 
 	if (ctx->probe && ctx->probe->pre_handler)
 		ctx->probe->pre_handler(ctx->probe, regs, addr);
@@ -354,12 +371,11 @@ out:
 }
 
 /* You must be holding kmmio_lock. */
-static int add_kmmio_fault_page(unsigned long page)
+static int add_kmmio_fault_page(unsigned long addr)
 {
 	struct kmmio_fault_page *f;
 
-	page &= PAGE_MASK;
-	f = get_kmmio_fault_page(page);
+	f = get_kmmio_fault_page(addr);
 	if (f) {
 		if (!f->count)
 			arm_kmmio_fault_page(f);
@@ -372,26 +388,25 @@ static int add_kmmio_fault_page(unsigned long page)
 		return -1;
 
 	f->count = 1;
-	f->page = page;
+	f->addr = addr;
 
 	if (arm_kmmio_fault_page(f)) {
 		kfree(f);
 		return -1;
 	}
 
-	list_add_rcu(&f->list, kmmio_page_list(f->page));
+	list_add_rcu(&f->list, kmmio_page_list(f->addr));
 
 	return 0;
 }
 
 /* You must be holding kmmio_lock. */
-static void release_kmmio_fault_page(unsigned long page,
+static void release_kmmio_fault_page(unsigned long addr,
 				struct kmmio_fault_page **release_list)
 {
 	struct kmmio_fault_page *f;
 
-	page &= PAGE_MASK;
-	f = get_kmmio_fault_page(page);
+	f = get_kmmio_fault_page(addr);
 	if (!f)
 		return;
 
@@ -420,18 +435,27 @@ int register_kmmio_probe(struct kmmio_probe *p)
 	int ret = 0;
 	unsigned long size = 0;
 	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
+	unsigned int l;
+	pte_t *pte;
 
 	spin_lock_irqsave(&kmmio_lock, flags);
 	if (get_kmmio_probe(p->addr)) {
 		ret = -EEXIST;
 		goto out;
 	}
+
+	pte = lookup_address(p->addr, &l);
+	if (!pte) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	kmmio_count++;
 	list_add_rcu(&p->list, &kmmio_probes);
 	while (size < size_lim) {
 		if (add_kmmio_fault_page(p->addr + size))
 			pr_err("Unable to set page fault.\n");
-		size += PAGE_SIZE;
+		size += page_level_size(l);
 	}
 out:
 	spin_unlock_irqrestore(&kmmio_lock, flags);
@@ -506,11 +530,17 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
 	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
 	struct kmmio_fault_page *release_list = NULL;
 	struct kmmio_delayed_release *drelease;
+	unsigned int l;
+	pte_t *pte;
+
+	pte = lookup_address(p->addr, &l);
+	if (!pte)
+		return;
 
 	spin_lock_irqsave(&kmmio_lock, flags);
 	while (size < size_lim) {
 		release_kmmio_fault_page(p->addr + size, &release_list);
-		size += PAGE_SIZE;
+		size += page_level_size(l);
 	}
 	list_del_rcu(&p->list);
 	kmmio_count--;
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 72bb52f93c3d..d2dc0438d654 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -94,18 +94,6 @@ static unsigned long mmap_base(unsigned long rnd)
 }
 
 /*
- * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
- * does, but not when emulating X86_32
- */
-static unsigned long mmap_legacy_base(unsigned long rnd)
-{
-	if (mmap_is_ia32())
-		return TASK_UNMAPPED_BASE;
-	else
-		return TASK_UNMAPPED_BASE + rnd;
-}
-
-/*
  * This function, called very early during the creation of a new
  * process VM image, sets up which VM layout function to use:
  */
@@ -116,7 +104,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	if (current->flags & PF_RANDOMIZE)
 		random_factor = arch_mmap_rnd();
 
-	mm->mmap_legacy_base = mmap_legacy_base(random_factor);
+	mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
 
 	if (mmap_is_legacy()) {
 		mm->mmap_base = mm->mmap_legacy_base;
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index d04f8094bc23..f70c1ff46125 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -465,46 +465,67 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
 	return true;
 }
 
+/*
+ * Mark all currently memblock-reserved physical memory (which covers the
+ * kernel's own memory ranges) as hot-unswappable.
+ */
 static void __init numa_clear_kernel_node_hotplug(void)
 {
-	int i, nid;
-	nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
-	phys_addr_t start, end;
-	struct memblock_region *r;
+	nodemask_t reserved_nodemask = NODE_MASK_NONE;
+	struct memblock_region *mb_region;
+	int i;
 
 	/*
+	 * We have to do some preprocessing of memblock regions, to
+	 * make them suitable for reservation.
+	 *
 	 * At this time, all memory regions reserved by memblock are
-	 * used by the kernel. Set the nid in memblock.reserved will
-	 * mark out all the nodes the kernel resides in.
+	 * used by the kernel, but those regions are not split up
+	 * along node boundaries yet, and don't necessarily have their
+	 * node ID set yet either.
+	 *
+	 * So iterate over all memory known to the x86 architecture,
+	 * and use those ranges to set the nid in memblock.reserved.
+	 * This will split up the memblock regions along node
+	 * boundaries and will set the node IDs as well.
 	 */
 	for (i = 0; i < numa_meminfo.nr_blks; i++) {
-		struct numa_memblk *mb = &numa_meminfo.blk[i];
+		struct numa_memblk *mb = numa_meminfo.blk + i;
+		int ret;
 
-		memblock_set_node(mb->start, mb->end - mb->start,
-				  &memblock.reserved, mb->nid);
+		ret = memblock_set_node(mb->start, mb->end - mb->start, &memblock.reserved, mb->nid);
+		WARN_ON_ONCE(ret);
 	}
 
 	/*
-	 * Mark all kernel nodes.
+	 * Now go over all reserved memblock regions, to construct a
+	 * node mask of all kernel reserved memory areas.
 	 *
-	 * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
-	 * may not include all the memblock.reserved memory ranges because
-	 * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
+	 *   numa_meminfo might not include all memblock.reserved
+	 *   memory ranges, because quirks such as trim_snb_memory()
+	 *   reserve specific pages for Sandy Bridge graphics. ]
 	 */
-	for_each_memblock(reserved, r)
-		if (r->nid != MAX_NUMNODES)
-			node_set(r->nid, numa_kernel_nodes);
+	for_each_memblock(reserved, mb_region) {
+		if (mb_region->nid != MAX_NUMNODES)
+			node_set(mb_region->nid, reserved_nodemask);
+	}
 
-	/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
+	/*
+	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
+	 * belonging to the reserved node mask.
+	 *
+	 * Note that this will include memory regions that reside
+	 * on nodes that contain kernel memory - entire nodes
+	 * become hot-unpluggable:
+	 */
 	for (i = 0; i < numa_meminfo.nr_blks; i++) {
-		nid = numa_meminfo.blk[i].nid;
-		if (!node_isset(nid, numa_kernel_nodes))
-			continue;
+		struct numa_memblk *mb = numa_meminfo.blk + i;
 
-		start = numa_meminfo.blk[i].start;
-		end = numa_meminfo.blk[i].end;
+		if (!node_isset(mb->nid, reserved_nodemask))
+			continue;
 
-		memblock_clear_hotplug(start, end - start);
+		memblock_clear_hotplug(mb->start, mb->end - mb->start);
 	}
 }
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1c37e650acac..007ebe2d8157 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1128,8 +1128,10 @@ static int __cpa_process_fault(struct cpa_data *cpa, unsigned long vaddr,
 	/*
 	 * Ignore all non primary paths.
 	 */
-	if (!primary)
+	if (!primary) {
+		cpa->numpages = 1;
 		return 0;
+	}
 
 	/*
 	 * Ignore the NULL PTE for kernel identity mapping, as it is expected
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index f65a33f505b6..8bea84724a7d 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -32,9 +32,8 @@ early_param("noexec", noexec_setup);
 
 void x86_configure_nx(void)
 {
-	if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx)
-		__supported_pte_mask |= _PAGE_NX;
-	else
+	/* If disable_nx is set, clear NX on all new mappings going forward. */
+	if (disable_nx)
 		__supported_pte_mask &= ~_PAGE_NX;
 }
 