Diffstat (limited to 'arch/x86/mm')
-rw-r--r--	arch/x86/mm/extable.c		 31
-rw-r--r--	arch/x86/mm/fault.c		 13
-rw-r--r--	arch/x86/mm/init.c		  4
-rw-r--r--	arch/x86/mm/init_32.c		 10
-rw-r--r--	arch/x86/mm/init_64.c		 35
-rw-r--r--	arch/x86/mm/ioremap.c		 50
-rw-r--r--	arch/x86/mm/k8topology_64.c	101
-rw-r--r--	arch/x86/mm/kmmio.c		  8
-rw-r--r--	arch/x86/mm/numa_32.c		  4
-rw-r--r--	arch/x86/mm/numa_64.c		252
-rw-r--r--	arch/x86/mm/pageattr.c		 22
-rw-r--r--	arch/x86/mm/pat.c		 20
-rw-r--r--	arch/x86/mm/setup_nx.c		 59
-rw-r--r--	arch/x86/mm/srat_64.c		 33
-rw-r--r--	arch/x86/mm/testmmiotrace.c	 29
-rw-r--r--	arch/x86/mm/tlb.c		  3
16 files changed, 430 insertions, 244 deletions
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61b41ca3b5a2..d0474ad2a6e5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs)
 
 	return 0;
 }
-
-#ifdef CONFIG_X86_64
-/*
- * Need to defined our own search_extable on X86_64 to work around
- * a B stepping K8 bug.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
-	       const struct exception_table_entry *last,
-	       unsigned long value)
-{
-	/* B stepping K8 bug */
-	if ((value >> 32) == 0)
-		value |= 0xffffffffUL << 32;
-
-	while (first <= last) {
-		const struct exception_table_entry *mid;
-		long diff;
-
-		mid = (last - first) / 2 + first;
-		diff = mid->insn - value;
-		if (diff == 0)
-			return mid;
-		else if (diff < 0)
-			first = mid+1;
-		else
-			last = mid-1;
-	}
-	return NULL;
-}
-#endif
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee9028cf0..f62777940dfb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
-static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
+static inline int __kprobes
+kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
 	if (unlikely(is_kmmio_active()))
 		if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 	return 0;
 }
 
-static inline int notify_page_fault(struct pt_regs *regs)
+static inline int __kprobes notify_page_fault(struct pt_regs *regs)
 {
 	int ret = 0;
 
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
  *
  * Handle a fault on the vmalloc or module mapping area
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
 	unsigned long pgd_paddr;
 	pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
  *
  * This assumes no large pages in there.
  */
-static noinline int vmalloc_fault(unsigned long address)
+static noinline __kprobes int vmalloc_fault(unsigned long address)
 {
 	pgd_t *pgd, *pgd_ref;
 	pud_t *pud, *pud_ref;
@@ -658,7 +659,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	show_fault_oops(regs, error_code, address);
 
 	stackend = end_of_stack(tsk);
-	if (*stackend != STACK_END_MAGIC)
+	if (tsk != &init_task && *stackend != STACK_END_MAGIC)
 		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
 
 	tsk->thread.cr2 = address;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
  */
-static noinline int
+static noinline __kprobes int
 spurious_fault(unsigned long error_code, unsigned long address)
 {
 	pgd_t *pgd;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..d406c5239019 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	use_gbpages = direct_gbpages;
 #endif
 
-	set_nx();
-	if (nx_enabled)
-		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-
 	/* Enable PSE if available */
 	if (cpu_has_pse)
 		set_in_cr4(X86_CR4_PSE);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..c973f8e2a6cf 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 	pkmap_page_table = pte;
 }
 
-static void __init add_one_highpage_init(struct page *page, int pfn)
+static void __init add_one_highpage_init(struct page *page)
 {
 	ClearPageReserved(page);
 	init_page_count(page);
@@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
 		if (!pfn_valid(node_pfn))
 			continue;
 		page = pfn_to_page(node_pfn);
-		add_one_highpage_init(page, node_pfn);
+		add_one_highpage_init(page);
 	}
 
 	return 0;
@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
 }
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init initmem_init(unsigned long start_pfn,
-			 unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+			 int acpi, int k8)
 {
 #ifdef CONFIG_HIGHMEM
 	highstart_pfn = highend_pfn = max_pfn;
@@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void)
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
-static int kernel_set_to_readonly;
+int kernel_set_to_readonly __read_mostly;
 
 void set_kernel_text_rw(void)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..5198b9bb34ef 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
 }
 
 #ifndef CONFIG_NUMA
-void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+			 int acpi, int k8)
 {
 	unsigned long bootmap_size, bootmap;
 
@@ -694,12 +695,12 @@ void __init mem_init(void)
 const int rodata_test_data = 0xC3;
 EXPORT_SYMBOL_GPL(rodata_test_data);
 
-static int kernel_set_to_readonly;
+int kernel_set_to_readonly;
 
 void set_kernel_text_rw(void)
 {
-	unsigned long start = PFN_ALIGN(_stext);
-	unsigned long end = PFN_ALIGN(__start_rodata);
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long end = PFN_ALIGN(__stop___ex_table);
 
 	if (!kernel_set_to_readonly)
 		return;
@@ -707,13 +708,18 @@ void set_kernel_text_rw(void)
 	pr_debug("Set kernel text: %lx - %lx for read write\n",
 		 start, end);
 
+	/*
+	 * Make the kernel identity mapping for text RW. Kernel text
+	 * mapping will always be RO. Refer to the comment in
+	 * static_protections() in pageattr.c
+	 */
 	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
 }
 
 void set_kernel_text_ro(void)
 {
-	unsigned long start = PFN_ALIGN(_stext);
-	unsigned long end = PFN_ALIGN(__start_rodata);
+	unsigned long start = PFN_ALIGN(_text);
+	unsigned long end = PFN_ALIGN(__stop___ex_table);
 
 	if (!kernel_set_to_readonly)
 		return;
@@ -721,14 +727,21 @@ void set_kernel_text_ro(void)
 	pr_debug("Set kernel text: %lx - %lx for read only\n",
 		 start, end);
 
+	/*
+	 * Set the kernel identity mapping for text RO.
+	 */
 	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
 }
 
 void mark_rodata_ro(void)
 {
-	unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
+	unsigned long start = PFN_ALIGN(_text);
 	unsigned long rodata_start =
 		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
+	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
+	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
+	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
+	unsigned long data_start = (unsigned long) &_sdata;
 
 	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
 	       (end - start) >> 10);
@@ -751,6 +764,14 @@ void mark_rodata_ro(void)
 	printk(KERN_INFO "Testing CPA: again\n");
 	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
 #endif
+
+	free_init_pages("unused kernel memory",
+			(unsigned long) page_address(virt_to_page(text_end)),
+			(unsigned long)
+				page_address(virt_to_page(rodata_start)));
+	free_init_pages("unused kernel memory",
+			(unsigned long) page_address(virt_to_page(rodata_end)),
+			(unsigned long) page_address(virt_to_page(data_start)));
 }
 
 #endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 334e63ca7b2b..c246d259822d 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -170,8 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 			(unsigned long long)phys_addr,
 			(unsigned long long)(phys_addr + size),
 			prot_val, new_prot_val);
-			free_memtype(phys_addr, phys_addr + size);
-			return NULL;
+			goto err_free_memtype;
 		}
 		prot_val = new_prot_val;
 	}
@@ -197,26 +196,25 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 	 */
 	area = get_vm_area_caller(size, VM_IOREMAP, caller);
 	if (!area)
-		return NULL;
+		goto err_free_memtype;
 	area->phys_addr = phys_addr;
 	vaddr = (unsigned long) area->addr;
 
-	if (kernel_map_sync_memtype(phys_addr, size, prot_val)) {
-		free_memtype(phys_addr, phys_addr + size);
-		free_vm_area(area);
-		return NULL;
-	}
+	if (kernel_map_sync_memtype(phys_addr, size, prot_val))
+		goto err_free_area;
 
-	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
-		free_memtype(phys_addr, phys_addr + size);
-		free_vm_area(area);
-		return NULL;
-	}
+	if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot))
+		goto err_free_area;
 
 	ret_addr = (void __iomem *) (vaddr + offset);
 	mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
 
 	return ret_addr;
+err_free_area:
+	free_vm_area(area);
+err_free_memtype:
+	free_memtype(phys_addr, phys_addr + size);
+	return NULL;
 }
 
 /**
@@ -283,30 +281,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
 }
 EXPORT_SYMBOL(ioremap_cache);
 
-static void __iomem *ioremap_default(resource_size_t phys_addr,
-				     unsigned long size)
-{
-	unsigned long flags;
-	void __iomem *ret;
-	int err;
-
-	/*
-	 * - WB for WB-able memory and no other conflicting mappings
-	 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
-	 * - Inherit from confliting mappings otherwise
-	 */
-	err = reserve_memtype(phys_addr, phys_addr + size,
-			      _PAGE_CACHE_WB, &flags);
-	if (err < 0)
-		return NULL;
-
-	ret = __ioremap_caller(phys_addr, size, flags,
-			       __builtin_return_address(0));
-
-	free_memtype(phys_addr, phys_addr + size);
-	return ret;
-}
-
 void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 			   unsigned long prot_val)
 {
@@ -382,7 +356,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
 	if (page_is_ram(start >> PAGE_SHIFT))
 		return __va(phys);
 
-	addr = (void __force *)ioremap_default(start, PAGE_SIZE);
+	addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
 	if (addr)
 		addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
 
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
 #include <asm/apic.h>
 #include <asm/k8.h>
 
+static struct bootnode __initdata nodes[8];
+static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+
 static __init int find_northbridge(void)
 {
 	int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
 	 * need to get boot_cpu_id so can use that to create apicid_to_node
 	 * in k8_scan_nodes()
 	 */
-	/*
-	 * Find possible boot-time SMP configuration:
-	 */
-#ifdef CONFIG_X86_MPPARSE
-	early_find_smp_config();
-#endif
-#ifdef CONFIG_ACPI
-	/*
-	 * Read APIC information from ACPI tables.
-	 */
-	early_acpi_boot_init();
-#endif
 #ifdef CONFIG_X86_MPPARSE
 	/*
 	 * get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
 	early_init_lapic_mapping();
 }
 
-int __init k8_scan_nodes(unsigned long start, unsigned long end)
+int __init k8_get_nodes(struct bootnode *physnodes)
 {
-	unsigned numnodes, cores, bits, apicid_base;
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
+int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long start = PFN_PHYS(start_pfn);
+	unsigned long end = PFN_PHYS(end_pfn);
+	unsigned numnodes;
 	unsigned long prevbase;
-	struct bootnode nodes[8];
-	int i, j, nb, found = 0;
+	int i, nb, found = 0;
 	u32 nodeid, reg;
 
 	if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	if (nb < 0)
 		return nb;
 
-	printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
+	pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
 
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
 		return -1;
 
-	printk(KERN_INFO "Number of nodes %d\n", numnodes);
+	pr_info("Number of physical nodes %d\n", numnodes);
 
-	memset(&nodes, 0, sizeof(nodes));
 	prevbase = 0;
 	for (i = 0; i < 8; i++) {
 		unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
-				printk("Skipping disabled node %d\n", i);
+				pr_info("Skipping disabled node %d\n", i);
 			continue;
 		}
 		if (nodeid >= numnodes) {
-			printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+			pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
 				base, limit);
 			continue;
 		}
 
 		if (!limit) {
-			printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
+			pr_info("Skipping node entry %d (base %lx)\n",
 				i, base);
 			continue;
 		}
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
-			printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
-			       nodeid, (base>>8)&3, (limit>>8) & 3);
+			pr_err("Node %d using interleaving mode %lx/%lx\n",
+			       nodeid, (base >> 8) & 3, (limit >> 8) & 3);
 			return -1;
 		}
-		if (node_isset(nodeid, node_possible_map)) {
-			printk(KERN_INFO "Node %d already present. Skipping\n",
+		if (node_isset(nodeid, nodes_parsed)) {
+			pr_info("Node %d already present, skipping\n",
 			       nodeid);
 			continue;
 		}
 
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		limit |= (1<<24)-1;
 		limit++;
 
-		if (limit > max_pfn << PAGE_SHIFT)
-			limit = max_pfn << PAGE_SHIFT;
+		if (limit > end)
+			limit = end;
 		if (limit <= base)
 			continue;
 
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 		if (limit > end)
 			limit = end;
 		if (limit == base) {
-			printk(KERN_ERR "Empty node %d\n", nodeid);
+			pr_err("Empty node %d\n", nodeid);
 			continue;
 		}
 		if (limit < base) {
-			printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
+			pr_err("Node %d bogus settings %lx-%lx.\n",
 			       nodeid, base, limit);
 			continue;
 		}
 
 		/* Could sort here, but pun for now. Should not happen anyroads. */
 		if (prevbase > base) {
-			printk(KERN_ERR "Node map not sorted %lx,%lx\n",
+			pr_err("Node map not sorted %lx,%lx\n",
 			       prevbase, base);
 			return -1;
 		}
 
-		printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
+		pr_info("Node %d MemBase %016lx Limit %016lx\n",
 			nodeid, base, limit);
 
 		found++;
 
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 
 		prevbase = base;
 
-		node_set(nodeid, node_possible_map);
+		node_set(nodeid, nodes_parsed);
 	}
 
 	if (!found)
 		return -1;
+	return 0;
+}
+
+int __init k8_scan_nodes(void)
+{
+	unsigned int bits;
+	unsigned int cores;
+	unsigned int apicid_base;
+	int i;
 
+	BUG_ON(nodes_empty(nodes_parsed));
+	node_possible_map = nodes_parsed;
 	memnode_shift = compute_hash_shift(nodes, 8, NULL);
 	if (memnode_shift < 0) {
-		printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
+		pr_err("No NUMA node hash function found. Contact maintainer\n");
 		return -1;
 	}
-	printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
+	pr_info("Using node hash shift of %d\n", memnode_shift);
 
 	/* use the coreid bits from early_identify_cpu */
 	bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
 	/* need to get boot_cpu_id early for system with apicid lifting */
 	early_get_boot_cpu_id();
 	if (boot_cpu_physical_apicid > 0) {
-		printk(KERN_INFO "BSP APIC ID: %02x\n",
-		       boot_cpu_physical_apicid);
+		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for (i = 0; i < 8; i++) {
-		if (nodes[i].start == nodes[i].end)
-			continue;
+	for_each_node_mask(i, node_possible_map) {
+		int j;
 
 		e820_register_active_regions(i,
 				nodes[i].start >> PAGE_SHIFT,
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..11a4ad4d6253 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -540,8 +540,14 @@ kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
 	struct die_args *arg = args;
 
 	if (val == DIE_DEBUG && (arg->err & DR_STEP))
-		if (post_kmmio_handler(arg->err, arg->regs) == 1)
+		if (post_kmmio_handler(arg->err, arg->regs) == 1) {
+			/*
+			 * Reset the BS bit in dr6 (pointed by args->err) to
+			 * denote completion of processing
+			 */
+			(*(unsigned long *)ERR_PTR(arg->err)) &= ~DR_STEP;
 			return NOTIFY_STOP;
+		}
 
 	return NOTIFY_DONE;
 }
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..b20760ca7244 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
 		(ulong) node_remap_end_vaddr[nid]);
 }
 
-void __init initmem_init(unsigned long start_pfn,
-			 unsigned long end_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
+			 int acpi, int k8)
 {
 	int nid;
 	long kva_target_pfn;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..83bbc70d11bb 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	bootmap = early_node_mem(nodeid, bootmap_start, end,
 				 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 	if (bootmap == NULL) {
-		if (nodedata_phys < start || nodedata_phys >= end)
-			free_bootmem(nodedata_phys, pgdat_size);
+		if (nodedata_phys < start || nodedata_phys >= end) {
+			/*
+			 * only need to free it if it is from other node
+			 * bootmem
+			 */
+			if (nid != nodeid)
+				free_bootmem(nodedata_phys, pgdat_size);
+		}
 		node_data[nodeid] = NULL;
 		return;
 	}
@@ -306,8 +312,71 @@ void __init numa_init_array(void)
 
 #ifdef CONFIG_NUMA_EMU
 /* Numa emulation */
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __initdata;
 static char *cmdline __initdata;
 
+static int __init setup_physnodes(unsigned long start, unsigned long end,
+					int acpi, int k8)
+{
+	int nr_nodes = 0;
+	int ret = 0;
+	int i;
+
+#ifdef CONFIG_ACPI_NUMA
+	if (acpi)
+		nr_nodes = acpi_get_nodes(physnodes);
+#endif
+#ifdef CONFIG_K8_NUMA
+	if (k8)
+		nr_nodes = k8_get_nodes(physnodes);
+#endif
+	/*
+	 * Basic sanity checking on the physical node map: there may be errors
+	 * if the SRAT or K8 incorrectly reported the topology or the mem=
+	 * kernel parameter is used.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		if (physnodes[i].start > end) {
+			physnodes[i].end = physnodes[i].start;
+			continue;
+		}
+		if (physnodes[i].end < start) {
+			physnodes[i].start = physnodes[i].end;
+			continue;
+		}
+		if (physnodes[i].start < start)
+			physnodes[i].start = start;
+		if (physnodes[i].end > end)
+			physnodes[i].end = end;
+	}
+
+	/*
+	 * Remove all nodes that have no memory or were truncated because of the
+	 * limited address range.
+	 */
+	for (i = 0; i < nr_nodes; i++) {
+		if (physnodes[i].start == physnodes[i].end)
+			continue;
+		physnodes[ret].start = physnodes[i].start;
+		physnodes[ret].end = physnodes[i].end;
+		ret++;
+	}
+
+	/*
+	 * If no physical topology was detected, a single node is faked to cover
+	 * the entire address space.
+	 */
+	if (!ret) {
+		physnodes[ret].start = start;
+		physnodes[ret].end = end;
+		ret = 1;
+	}
+	return ret;
+}
+
 /*
  * Setups up nid to range from addr to addr + size.  If the end
  * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +384,9 @@ static char *cmdline __initdata;
  * allocation past addr and -1 otherwise.  addr is adjusted to be at
  * the end of the node.
  */
-static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
-				   u64 size, u64 max_addr)
+static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 {
 	int ret = 0;
-
 	nodes[nid].start = *addr;
 	*addr += size;
 	if (*addr >= max_addr) {
@@ -335,12 +402,111 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
 }
 
 /*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.  The return value is the number of nodes allocated.
+ */
+static int __init split_nodes_interleave(u64 addr, u64 max_addr,
+					 int nr_phys_nodes, int nr_nodes)
+{
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 size;
+	int big;
+	int ret = 0;
+	int i;
+
+	if (nr_nodes <= 0)
+		return -1;
+	if (nr_nodes > MAX_NUMNODES) {
+		pr_info("numa=fake=%d too large, reducing to %d\n",
+			nr_nodes, MAX_NUMNODES);
+		nr_nodes = MAX_NUMNODES;
+	}
+
+	size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
+	/*
+	 * Calculate the number of big nodes that can be allocated as a result
+	 * of consolidating the remainder.
+	 */
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+		FAKE_NODE_MIN_SIZE;
+
+	size &= FAKE_NODE_MIN_HASH_MASK;
+	if (!size) {
+		pr_err("Not enough memory for each node.  "
+			"NUMA emulation disabled.\n");
+		return -1;
+	}
+
+	for (i = 0; i < nr_phys_nodes; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+
+	/*
+	 * Continue to fill physical nodes with fake nodes until there is no
+	 * memory left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 end = physnodes[i].start + size;
+			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
+
+			if (ret < big)
+				end += FAKE_NODE_MIN_SIZE;
+
+			/*
+			 * Continue to add memory to this fake node if its
+			 * non-reserved memory is less than the per-node size.
+			 */
+			while (end - physnodes[i].start -
+				e820_hole_size(physnodes[i].start, end) < size) {
+				end += FAKE_NODE_MIN_SIZE;
+				if (end > physnodes[i].end) {
+					end = physnodes[i].end;
+					break;
+				}
+			}
+
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Avoid allocating more nodes than requested, which can
+			 * happen as a result of rounding down each node's size
+			 * to FAKE_NODE_MIN_SIZE.
+			 */
+			if (nodes_weight(physnode_mask) + ret >= nr_nodes)
+				end = physnodes[i].end;
+
+			if (setup_node_range(ret++, &physnodes[i].start,
+						end - physnodes[i].start,
+						physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
+}
+
+/*
  * Splits num_nodes nodes up equally starting at node_start.  The return value
  * is the number of nodes split up and addr is adjusted to be at the end of the
  * last node allocated.
  */
-static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start,
+static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
 				      int num_nodes)
 {
 	unsigned int big;
@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 				break;
 			}
 		}
-		if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
+		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
 	}
 	return i - node_start + 1;
@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
 * always assigned to a final node and can be asymmetric.  Returns the number of
 * nodes split.
 */
-static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
-				      u64 max_addr, int node_start, u64 size)
+static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
+				      u64 size)
 {
 	int i = node_start;
 	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, nodes, addr, size, max_addr))
+	while (!setup_node_range(i++, addr, size, max_addr))
 		;
 	return i - node_start;
 }
@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
 * Sets up the system RAM area from start_pfn to last_pfn according to the
 * numa=fake command-line option.
 */
-static struct bootnode nodes[MAX_NUMNODES] __initdata;
-
-static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
+static int __init numa_emulation(unsigned long start_pfn,
+			unsigned long last_pfn, int acpi, int k8)
 {
 	u64 size, addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
 	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
+	int num_phys_nodes;
 
-	memset(&nodes, 0, sizeof(nodes));
+	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
 	 * If the numa=fake command-line is just a single number N, split the
 	 * system RAM into N fake nodes.
@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
 		long n = simple_strtol(cmdline, NULL, 0);
 
-		num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
+		num_nodes = split_nodes_interleave(addr, max_addr,
+						num_phys_nodes, n);
 		if (num_nodes < 0)
 			return num_nodes;
 		goto out;
@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
 		size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
 		if (size)
 			for (i = 0; i < coeff; i++, num_nodes++)
-				if (setup_node_range(num_nodes, nodes,
-						&addr, size, max_addr) < 0)
+				if (setup_node_range(num_nodes, &addr,
+						size, max_addr) < 0)
 					goto done;
 		if (!*cmdline)
 			break;
@@ -473,7 +640,7 @@ done:
 	if (addr < max_addr) {
 		if (coeff_flag && coeff < 0) {
 			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
+			num_nodes += split_nodes_by_size(&addr, max_addr,
 					num_nodes, num);
 			goto out;
 		}
@@ -482,7 +649,7 @@ done:
 		/* Split remaining nodes into coeff chunks */
 		if (coeff <= 0)
 			break;
-		num_nodes += split_nodes_equally(nodes, &addr, max_addr,
+		num_nodes += split_nodes_equally(&addr, max_addr,
 				num_nodes, coeff);
 		break;
 	case ',':
@@ -490,8 +657,8 @@ done:
 		break;
 	default:
 		/* Give one final node */
-		setup_node_range(num_nodes, nodes, &addr,
-				max_addr - addr, max_addr);
+		setup_node_range(num_nodes, &addr, max_addr - addr,
+				max_addr);
 		num_nodes++;
 	}
 }
@@ -505,14 +672,10 @@ out:
 	}
 
 	/*
-	 * We need to vacate all active ranges that may have been registered by
-	 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
-	 * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
+	 * We need to vacate all active ranges that may have been registered for
+	 * the e820 memory map.
 	 */
 	remove_all_active_ranges();
-#ifdef CONFIG_ACPI_NUMA
-	acpi_numa = -1;
-#endif
 	for_each_node_mask(i, node_possible_map) {
 		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 						nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +687,8 @@ out:
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
+void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
+				int acpi, int k8)
 {
 	int i;
 
@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
 	nodes_clear(node_online_map);
 
 #ifdef CONFIG_NUMA_EMU
-	if (cmdline && !numa_emulation(start_pfn, last_pfn))
+	if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
#ifdef CONFIG_ACPI_NUMA
-	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
+	if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 					  last_pfn << PAGE_SHIFT))
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
 #endif
 
 #ifdef CONFIG_K8_NUMA
-	if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
-					last_pfn<<PAGE_SHIFT))
+	if (!numa_off && k8 && !k8_scan_nodes())
 		return;
 	nodes_clear(node_possible_map);
 	nodes_clear(node_online_map);
@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
 early_param("numa", numa_setup);
 
 #ifdef CONFIG_NUMA
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
 /*
  * Setup early cpu_to_node.
  *
@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
 		if (node == NUMA_NO_NODE)
 			continue;
 		if (!node_online(node))
-			continue;
+			node = find_near_online_node(node);
 		numa_set_node(cpu, node);
 	}
 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..1d4eb93d333c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 		   __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
 		pgprot_val(forbidden) |= _PAGE_RW;
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
+	/*
+	 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
+	 * kernel text mappings for the large page aligned text, rodata sections
+	 * will be always read-only. For the kernel identity mappings covering
+	 * the holes caused by this alignment can be anything that user asks.
+	 *
+	 * This will preserve the large page mappings for kernel text/data
+	 * at no extra cost.
+	 */
+	if (kernel_set_to_readonly &&
+	    within(address, (unsigned long)_text,
+		   (unsigned long)__end_rodata_hpage_align))
+		pgprot_val(forbidden) |= _PAGE_RW;
+#endif
+
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
 
 	return prot;
@@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
 
 int set_memory_x(unsigned long addr, int numpages)
 {
+	if (!(__supported_pte_mask & _PAGE_NX))
+		return 0;
+
 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
 }
 EXPORT_SYMBOL(set_memory_x);
 
 int set_memory_nx(unsigned long addr, int numpages)
 {
+	if (!(__supported_pte_mask & _PAGE_NX))
+		return 0;
+
 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
 }
 EXPORT_SYMBOL(set_memory_nx);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..66b55d6e69ed 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -20,6 +20,7 @@
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
 #include <asm/tlbflush.h>
+#include <asm/x86_init.h>
 #include <asm/pgtable.h>
 #include <asm/fcntl.h>
 #include <asm/e820.h>
@@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
 * - _PAGE_CACHE_UC_MINUS
 * - _PAGE_CACHE_UC
 *
- * req_type will have a special case value '-1', when requester want to inherit
- * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
- *
 * If new_type is NULL, function will return an error if it cannot reserve the
 * region with req_type. If new_type is non-NULL, function will return
 * available type in new_type in case of no error. In case of any error
@@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	if (!pat_enabled) {
 		/* This is identical to page table setting without PAT */
 		if (new_type) {
-			if (req_type == -1)
-				*new_type = _PAGE_CACHE_WB;
-			else if (req_type == _PAGE_CACHE_WC)
+			if (req_type == _PAGE_CACHE_WC)
 				*new_type = _PAGE_CACHE_UC_MINUS;
 			else
 				*new_type = req_type & _PAGE_CACHE_MASK;
@@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
 	}
 
 	/* Low ISA region is always mapped WB in page table. No need to track */
-	if (is_ISA_range(start, end - 1)) {
+	if (x86_platform.is_untracked_pat_range(start, end)) {
 		if (new_type)
 			*new_type = _PAGE_CACHE_WB;
 		return 0;
@@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end)
 		return 0;
 
 	/* Low ISA region is always mapped WB. No need to track */
-	if (is_ISA_range(start, end - 1))
+	if (x86_platform.is_untracked_pat_range(start, end))
 		return 0;
 
 	is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr)
 	int rettype = _PAGE_CACHE_WB;
 	struct memtype *entry;
 
-	if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
+	if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
 		return rettype;
 
 	if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -1018,8 +1014,10 @@ static const struct file_operations memtype_fops = {
 
 static int __init pat_memtype_list_init(void)
 {
-	debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
-				NULL, &memtype_fops);
+	if (pat_enabled) {
+		debugfs_create_file("pat_memtype_list", S_IRUSR,
+				    arch_debugfs_dir, NULL, &memtype_fops);
+	}
 	return 0;
 }
 
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
 #include <linux/init.h>
 
 #include <asm/pgtable.h>
+#include <asm/proto.h>
 
-int nx_enabled;
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static int disable_nx __cpuinitdata;
 
 /*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
 	if (!str)
 		return -EINVAL;
 	if (!strncmp(str, "on", 2)) {
-		__supported_pte_mask |= _PAGE_NX;
 		disable_nx = 0;
 	} else if (!strncmp(str, "off", 3)) {
 		disable_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
 	}
+	x86_configure_nx();
 	return 0;
 }
 early_param("noexec", noexec_setup);
-#endif
 
-#ifdef CONFIG_X86_PAE
-void __init set_nx(void)
+void __cpuinit x86_configure_nx(void)
 {
-	unsigned int v[4], l, h;
-
-	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
-		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+	if (cpu_has_nx && !disable_nx)
+		__supported_pte_mask |= _PAGE_NX;
+	else
+		__supported_pte_mask &= ~_PAGE_NX;
+}
 
-		if ((v[3] & (1 << 20)) && !disable_nx) {
-			rdmsr(MSR_EFER, l, h);
-			l |= EFER_NX;
-			wrmsr(MSR_EFER, l, h);
-			nx_enabled = 1;
-			__supported_pte_mask |= _PAGE_NX;
+void __init x86_report_nx(void)
+{
+	if (!cpu_has_nx) {
+		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+		       "missing in CPU or disabled in BIOS!\n");
+	} else {
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+		if (disable_nx) {
+			printk(KERN_INFO "NX (Execute Disable) protection: "
+			       "disabled by kernel command line option\n");
+		} else {
+			printk(KERN_INFO "NX (Execute Disable) protection: "
+			       "active\n");
 		}
-	}
-}
 #else
-void set_nx(void)
-{
-}
+		/* 32bit non-PAE kernel, NX cannot be used */
+		printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
+		       "cannot be enabled: non-PAE kernel!\n");
 #endif
-
-#ifdef CONFIG_X86_64
-void __cpuinit check_efer(void)
-{
-	unsigned long efer;
-
-	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || disable_nx)
-		__supported_pte_mask &= ~_PAGE_NX;
+	}
 }
-#endif
-
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index dbb5381f7b3b..d89075489664 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
-	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
 	       pxm, apic_id, node);
 }
 
@@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	apicid_to_node[apic_id] = node;
 	node_set(node, cpu_nodes_parsed);
 	acpi_numa = 1;
-	printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
+	printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
 	       pxm, apic_id, node);
 }
 
@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 
 	printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
 	       start, end);
-	e820_register_active_regions(node, start >> PAGE_SHIFT,
-				     end >> PAGE_SHIFT);
 
 	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
 		update_nodes_add(node, start, end);
@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 
 void __init acpi_numa_arch_fixup(void) {}
 
+int __init acpi_get_nodes(struct bootnode *physnodes)
+{
+	int i;
+	int ret = 0;
+
+	for_each_node_mask(i, nodes_parsed) {
+		physnodes[ret].start = nodes[i].start;
+		physnodes[ret].end = nodes[i].end;
+		ret++;
+	}
+	return ret;
+}
+
 /* Use the information discovered above to actually set up the nodes. */
 int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 {
@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
 
-	if (!nodes_cover_memory(nodes)) {
-		bad_srat();
-		return -1;
-	}
-
 	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 					   memblk_nodeid);
 	if (memnode_shift < 0) {
@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 	}
 
+	for_each_node_mask(i, nodes_parsed)
+		e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
+						nodes[i].end >> PAGE_SHIFT);
+	if (!nodes_cover_memory(nodes)) {
+		bad_srat();
+		return -1;
+	}
+
 	/* Account for nodes with cpus and no memory */
 	nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
 
@@ -454,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 	for (i = 0; i < num_nodes; i++)
 		if (fake_nodes[i].start != fake_nodes[i].end)
 			node_set(i, nodes_parsed);
-	WARN_ON(!nodes_cover_memory(fake_nodes));
 }
 
 static int null_slit_node_compare(int a, int b)
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df5..8565d944f7cf 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
 /*
  * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/io.h>
 #include <linux/mmiotrace.h>
 
-#define MODULE_NAME "testmmiotrace"
-
 static unsigned long mmio_address;
 module_param(mmio_address, ulong, 0);
 MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
 static void do_write_test(void __iomem *p)
 {
 	unsigned int i;
-	pr_info(MODULE_NAME ": write test.\n");
+	pr_info("write test.\n");
 	mmiotrace_printk("Write test.\n");
 
 	for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
 {
 	unsigned int i;
 	unsigned errs[3] = { 0 };
-	pr_info(MODULE_NAME ": read test.\n");
+	pr_info("read test.\n");
 	mmiotrace_printk("Read test.\n");
 
 	for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
 
 static void do_read_far_test(void __iomem *p)
 {
-	pr_info(MODULE_NAME ": read far test.\n");
+	pr_info("read far test.\n");
 	mmiotrace_printk("Read far test.\n");
 
 	ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
 {
 	void __iomem *p = ioremap_nocache(mmio_address, size);
 	if (!p) {
-		pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
+		pr_err("could not ioremap, aborting.\n");
 		return;
 	}
 	mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
 	unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
 
 	if (mmio_address == 0) {
-		pr_err(MODULE_NAME ": you have to use the module argument "
-			"mmio_address.\n");
-		pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
-			" YOU REALLY KNOW WHAT YOU ARE DOING!\n");
+		pr_err("you have to use the module argument mmio_address.\n");
+		pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
 		return -ENXIO;
 	}
 
-	pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
-		"address space, and writing 16 kB of rubbish in there.\n",
-		size >> 10, mmio_address);
+	pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
+		   "and writing 16 kB of rubbish in there.\n",
+		   size >> 10, mmio_address);
 	do_test(size);
-	pr_info(MODULE_NAME ": All done.\n");
+	pr_info("All done.\n");
 	return 0;
 }
 
 static void __exit cleanup(void)
 {
-	pr_debug(MODULE_NAME ": unloaded.\n");
+	pr_debug("unloaded.\n");
 }
 
 module_init(init);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..65b58e4b0b8b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
 
@@ -43,7 +44,7 @@ union smp_flush_state {
 		spinlock_t tlbstate_lock;
 		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
 	};
-	char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
+	char pad[INTERNODE_CACHE_BYTES];
 } ____cacheline_internodealigned_in_smp;
 
 /* State is put into the per CPU data section, but padded