aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/fault.c47
-rw-r--r--arch/x86/mm/init_32.c4
-rw-r--r--arch/x86/mm/init_64.c49
-rw-r--r--arch/x86/mm/k8topology_64.c8
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c2
-rw-r--r--arch/x86/mm/numa_64.c2
-rw-r--r--arch/x86/mm/pgtable.c24
-rw-r--r--arch/x86/mm/srat_64.c8
-rw-r--r--arch/x86/mm/tlb.c48
10 files changed, 148 insertions, 46 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..79b0b372d2d0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
229 229
230 spin_lock_irqsave(&pgd_lock, flags); 230 spin_lock_irqsave(&pgd_lock, flags);
231 list_for_each_entry(page, &pgd_list, lru) { 231 list_for_each_entry(page, &pgd_list, lru) {
232 if (!vmalloc_sync_one(page_address(page), address)) 232 spinlock_t *pgt_lock;
233 pmd_t *ret;
234
235 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
236
237 spin_lock(pgt_lock);
238 ret = vmalloc_sync_one(page_address(page), address);
239 spin_unlock(pgt_lock);
240
241 if (!ret)
233 break; 242 break;
234 } 243 }
235 spin_unlock_irqrestore(&pgd_lock, flags); 244 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -251,6 +260,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
251 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 260 if (!(address >= VMALLOC_START && address < VMALLOC_END))
252 return -1; 261 return -1;
253 262
263 WARN_ON_ONCE(in_nmi());
264
254 /* 265 /*
255 * Synchronize this task's top level page-table 266 * Synchronize this task's top level page-table
256 * with the 'reference' page table. 267 * with the 'reference' page table.
@@ -326,29 +337,7 @@ out:
326 337
327void vmalloc_sync_all(void) 338void vmalloc_sync_all(void)
328{ 339{
329 unsigned long address; 340 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
330
331 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
332 address += PGDIR_SIZE) {
333
334 const pgd_t *pgd_ref = pgd_offset_k(address);
335 unsigned long flags;
336 struct page *page;
337
338 if (pgd_none(*pgd_ref))
339 continue;
340
341 spin_lock_irqsave(&pgd_lock, flags);
342 list_for_each_entry(page, &pgd_list, lru) {
343 pgd_t *pgd;
344 pgd = (pgd_t *)page_address(page) + pgd_index(address);
345 if (pgd_none(*pgd))
346 set_pgd(pgd, *pgd_ref);
347 else
348 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
349 }
350 spin_unlock_irqrestore(&pgd_lock, flags);
351 }
352} 341}
353 342
354/* 343/*
@@ -369,6 +358,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
369 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 358 if (!(address >= VMALLOC_START && address < VMALLOC_END))
370 return -1; 359 return -1;
371 360
361 WARN_ON_ONCE(in_nmi());
362
372 /* 363 /*
373 * Copy kernel mappings over when needed. This can also 364 * Copy kernel mappings over when needed. This can also
374 * happen within a race in page table update. In the later 365 * happen within a race in page table update. In the later
@@ -894,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
894 if (pmd_large(*pmd)) 885 if (pmd_large(*pmd))
895 return spurious_fault_check(error_code, (pte_t *) pmd); 886 return spurious_fault_check(error_code, (pte_t *) pmd);
896 887
888 /*
889 * Note: don't use pte_present() here, since it returns true
890 * if the _PAGE_PROTNONE bit is set. However, this aliases the
891 * _PAGE_GLOBAL bit, which for kernel pages give false positives
892 * when CONFIG_DEBUG_PAGEALLOC is used.
893 */
897 pte = pte_offset_kernel(pmd, address); 894 pte = pte_offset_kernel(pmd, address);
898 if (!pte_present(*pte)) 895 if (!(pte_flags(*pte) & _PAGE_PRESENT))
899 return 0; 896 return 0;
900 897
901 ret = spurious_fault_check(error_code, pte); 898 ret = spurious_fault_check(error_code, pte);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..558f2d332076 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -67,7 +67,7 @@ static __init void *alloc_low_page(void)
67 panic("alloc_low_page: ran out of memory"); 67 panic("alloc_low_page: ran out of memory");
68 68
69 adr = __va(pfn * PAGE_SIZE); 69 adr = __va(pfn * PAGE_SIZE);
70 memset(adr, 0, PAGE_SIZE); 70 clear_page(adr);
71 return adr; 71 return adr;
72} 72}
73 73
@@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE]
558 558
559static inline void save_pg_dir(void) 559static inline void save_pg_dir(void)
560{ 560{
561 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); 561 copy_page(swsusp_pg_dir, swapper_pg_dir);
562} 562}
563#else /* !CONFIG_ACPI_SLEEP */ 563#else /* !CONFIG_ACPI_SLEEP */
564static inline void save_pg_dir(void) 564static inline void save_pg_dir(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a20..c55f900fbf89 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str)
98__setup("noexec32=", nonx32_setup); 98__setup("noexec32=", nonx32_setup);
99 99
100/* 100/*
101 * When memory was added/removed make sure all the processes MM have
102 * suitable PGD entries in the local PGD level page.
103 */
104void sync_global_pgds(unsigned long start, unsigned long end)
105{
106 unsigned long address;
107
108 for (address = start; address <= end; address += PGDIR_SIZE) {
109 const pgd_t *pgd_ref = pgd_offset_k(address);
110 unsigned long flags;
111 struct page *page;
112
113 if (pgd_none(*pgd_ref))
114 continue;
115
116 spin_lock_irqsave(&pgd_lock, flags);
117 list_for_each_entry(page, &pgd_list, lru) {
118 pgd_t *pgd;
119 spinlock_t *pgt_lock;
120
121 pgd = (pgd_t *)page_address(page) + pgd_index(address);
122 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
123 spin_lock(pgt_lock);
124
125 if (pgd_none(*pgd))
126 set_pgd(pgd, *pgd_ref);
127 else
128 BUG_ON(pgd_page_vaddr(*pgd)
129 != pgd_page_vaddr(*pgd_ref));
130
131 spin_unlock(pgt_lock);
132 }
133 spin_unlock_irqrestore(&pgd_lock, flags);
134 }
135}
136
137/*
101 * NOTE: This function is marked __ref because it calls __init function 138 * NOTE: This function is marked __ref because it calls __init function
102 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 139 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103 */ 140 */
@@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
293 panic("alloc_low_page: ran out of memory"); 330 panic("alloc_low_page: ran out of memory");
294 331
295 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
296 memset(adr, 0, PAGE_SIZE); 333 clear_page(adr);
297 *phys = pfn * PAGE_SIZE; 334 *phys = pfn * PAGE_SIZE;
298 return adr; 335 return adr;
299} 336}
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
534 unsigned long end, 571 unsigned long end,
535 unsigned long page_size_mask) 572 unsigned long page_size_mask)
536{ 573{
537 574 bool pgd_changed = false;
538 unsigned long next, last_map_addr = end; 575 unsigned long next, last_map_addr = end;
576 unsigned long addr;
539 577
540 start = (unsigned long)__va(start); 578 start = (unsigned long)__va(start);
541 end = (unsigned long)__va(end); 579 end = (unsigned long)__va(end);
580 addr = start;
542 581
543 for (; start < end; start = next) { 582 for (; start < end; start = next) {
544 pgd_t *pgd = pgd_offset_k(start); 583 pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
563 spin_lock(&init_mm.page_table_lock); 602 spin_lock(&init_mm.page_table_lock);
564 pgd_populate(&init_mm, pgd, __va(pud_phys)); 603 pgd_populate(&init_mm, pgd, __va(pud_phys));
565 spin_unlock(&init_mm.page_table_lock); 604 spin_unlock(&init_mm.page_table_lock);
605 pgd_changed = true;
566 } 606 }
607
608 if (pgd_changed)
609 sync_global_pgds(addr, end);
610
567 __flush_tlb_all(); 611 __flush_tlb_all();
568 612
569 return last_map_addr; 613 return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1003 } 1047 }
1004 1048
1005 } 1049 }
1050 sync_global_pgds((unsigned long)start_page, end);
1006 return 0; 1051 return 0;
1007} 1052}
1008 1053
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 970ed579d4e4..52d54bfc1ebb 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,7 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/amd_nb.h>
26 26
27static struct bootnode __initdata nodes[8]; 27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; 28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +54,8 @@ static __init int find_northbridge(void)
54static __init void early_get_boot_cpu_id(void) 54static __init void early_get_boot_cpu_id(void)
55{ 55{
56 /* 56 /*
57 * need to get boot_cpu_id so can use that to create apicid_to_node 57 * need to get the APIC ID of the BSP so can use that to
58 * in k8_scan_nodes() 58 * create apicid_to_node in k8_scan_nodes()
59 */ 59 */
60#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
61 /* 61 /*
@@ -212,7 +212,7 @@ int __init k8_scan_nodes(void)
212 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
213 cores = (1<<bits); 213 cores = (1<<bits);
214 apicid_base = 0; 214 apicid_base = 0;
215 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* get the APIC ID of the BSP early for systems with apicid lifting */
216 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
217 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
631 if (!pte) 631 if (!pte)
632 return false; 632 return false;
633 633
634 WARN_ON_ONCE(in_nmi());
635
634 if (error_code & 2) 636 if (error_code & 2)
635 kmemcheck_access(regs, address, KMEMCHECK_WRITE); 637 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
636 else 638 else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
9 b == 0xf0 || b == 0xf2 || b == 0xf3 9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */ 10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e 12 || b == 0x64 || b == 0x65
13 /* Group 3 */ 13 /* Group 3 */
14 || b == 0x66 14 || b == 0x66
15 /* Group 4 */ 15 /* Group 4 */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..4962f1aeda6f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,7 +18,7 @@
18#include <asm/dma.h> 18#include <asm/dma.h>
19#include <asm/numa.h> 19#include <asm/numa.h>
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/amd_nb.h>
22 22
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8be8c7d7bc89 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
87#define UNSHARED_PTRS_PER_PGD \ 87#define UNSHARED_PTRS_PER_PGD \
88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89 89
90static void pgd_ctor(pgd_t *pgd) 90
91static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
92{
93 BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
94 virt_to_page(pgd)->index = (pgoff_t)mm;
95}
96
97struct mm_struct *pgd_page_get_mm(struct page *page)
98{
99 return (struct mm_struct *)page->index;
100}
101
102static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
91{ 103{
92 /* If the pgd points to a shared pagetable level (either the 104 /* If the pgd points to a shared pagetable level (either the
93 ptes in non-PAE, or shared PMD in PAE), then just copy the 105 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
98 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 110 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
99 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 111 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
100 KERNEL_PGD_PTRS); 112 KERNEL_PGD_PTRS);
101 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
102 __pa(swapper_pg_dir) >> PAGE_SHIFT,
103 KERNEL_PGD_BOUNDARY,
104 KERNEL_PGD_PTRS);
105 } 113 }
106 114
107 /* list required to sync kernel mapping updates */ 115 /* list required to sync kernel mapping updates */
108 if (!SHARED_KERNEL_PMD) 116 if (!SHARED_KERNEL_PMD) {
117 pgd_set_mm(pgd, mm);
109 pgd_list_add(pgd); 118 pgd_list_add(pgd);
119 }
110} 120}
111 121
112static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
272 */ 282 */
273 spin_lock_irqsave(&pgd_lock, flags); 283 spin_lock_irqsave(&pgd_lock, flags);
274 284
275 pgd_ctor(pgd); 285 pgd_ctor(mm, pgd);
276 pgd_prepopulate_pmd(mm, pgd, pmds); 286 pgd_prepopulate_pmd(mm, pgd, pmds);
277 287
278 spin_unlock_irqrestore(&pgd_lock, flags); 288 spin_unlock_irqrestore(&pgd_lock, flags);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index f9897f7a9ef1..9c0d0d399c30 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -420,9 +420,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
420 return -1; 420 return -1;
421 } 421 }
422 422
423 for_each_node_mask(i, nodes_parsed) 423 for (i = 0; i < num_node_memblks; i++)
424 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 424 e820_register_active_regions(memblk_nodeid[i],
425 nodes[i].end >> PAGE_SHIFT); 425 node_memblk_range[i].start >> PAGE_SHIFT,
426 node_memblk_range[i].end >> PAGE_SHIFT);
427
426 /* for out of order entries in SRAT */ 428 /* for out of order entries in SRAT */
427 sort_node_map(); 429 sort_node_map();
428 if (!nodes_cover_memory(nodes)) { 430 if (!nodes_cover_memory(nodes)) {
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
5#include <linux/smp.h> 5#include <linux/smp.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cpu.h>
8 9
9#include <asm/tlbflush.h> 10#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
52 want false sharing in the per cpu data segment. */ 53 want false sharing in the per cpu data segment. */
53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; 54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54 55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57
55/* 58/*
56 * We cannot call mmdrop() because we are in interrupt context, 59 * We cannot call mmdrop() because we are in interrupt context,
57 * instead update mm->cpu_vm_mask. 60 * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173 union smp_flush_state *f; 176 union smp_flush_state *f;
174 177
175 /* Caller has disabled preemption */ 178 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 179 sender = this_cpu_read(tlb_vector_offset);
177 f = &flush_state[sender]; 180 f = &flush_state[sender];
178 181
179 /* 182 /*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
218 flush_tlb_others_ipi(cpumask, mm, va); 221 flush_tlb_others_ipi(cpumask, mm, va);
219} 222}
220 223
224static void __cpuinit calculate_tlb_offset(void)
225{
226 int cpu, node, nr_node_vecs;
227 /*
228 * we are changing tlb_vector_offset for each CPU in runtime, but this
229 * will not cause inconsistency, as the write is atomic under X86. we
230 * might see more lock contentions in a short time, but after all CPU's
231 * tlb_vector_offset are changed, everything should go normal
232 *
233 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
234 * waste some vectors.
235 **/
236 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
237 nr_node_vecs = 1;
238 else
239 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
240
241 for_each_online_node(node) {
242 int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
243 nr_node_vecs;
244 int cpu_offset = 0;
245 for_each_cpu(cpu, cpumask_of_node(node)) {
246 per_cpu(tlb_vector_offset, cpu) = node_offset +
247 cpu_offset;
248 cpu_offset++;
249 cpu_offset = cpu_offset % nr_node_vecs;
250 }
251 }
252}
253
254static int tlb_cpuhp_notify(struct notifier_block *n,
255 unsigned long action, void *hcpu)
256{
257 switch (action & 0xf) {
258 case CPU_ONLINE:
259 case CPU_DEAD:
260 calculate_tlb_offset();
261 }
262 return NOTIFY_OK;
263}
264
221static int __cpuinit init_smp_flush(void) 265static int __cpuinit init_smp_flush(void)
222{ 266{
223 int i; 267 int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 269 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 raw_spin_lock_init(&flush_state[i].tlbstate_lock); 270 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 271
272 calculate_tlb_offset();
273 hotcpu_notifier(tlb_cpuhp_notify, 0);
228 return 0; 274 return 0;
229} 275}
230core_initcall(init_smp_flush); 276core_initcall(init_smp_flush);