path: root/arch/x86/mm
author    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/mm
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp (wip-k-fmlp)

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/Makefile                                                  |   7
-rw-r--r--  arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/k8topology_64.c)  | 122
-rw-r--r--  arch/x86/mm/fault.c                                                   | 148
-rw-r--r--  arch/x86/mm/gup.c                                                     |  28
-rw-r--r--  arch/x86/mm/highmem_32.c                                              |  76
-rw-r--r--  arch/x86/mm/hugetlbpage.c                                             |   6
-rw-r--r--  arch/x86/mm/init.c                                                    |  85
-rw-r--r--  arch/x86/mm/init_32.c                                                 | 202
-rw-r--r--  arch/x86/mm/init_64.c                                                 | 239
-rw-r--r--  arch/x86/mm/iomap_32.c                                                |  43
-rw-r--r--  arch/x86/mm/ioremap.c                                                 |  19
-rw-r--r--  arch/x86/mm/kmemcheck/error.c                                         |   2
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c                                     |   2
-rw-r--r--  arch/x86/mm/kmemcheck/opcode.c                                        |   2
-rw-r--r--  arch/x86/mm/memblock.c                                                | 348
-rw-r--r--  arch/x86/mm/memtest.c                                                 |   7
-rw-r--r--  arch/x86/mm/numa.c                                                    | 773
-rw-r--r--  arch/x86/mm/numa_32.c                                                 | 400
-rw-r--r--  arch/x86/mm/numa_64.c                                                 | 882
-rw-r--r--  arch/x86/mm/numa_emulation.c                                          | 492
-rw-r--r--  arch/x86/mm/numa_internal.h                                           |  39
-rw-r--r--  arch/x86/mm/pageattr.c                                                |  45
-rw-r--r--  arch/x86/mm/pf_in.c                                                   |  14
-rw-r--r--  arch/x86/mm/pgtable.c                                                 | 104
-rw-r--r--  arch/x86/mm/setup_nx.c                                                |   2
-rw-r--r--  arch/x86/mm/srat.c                                                    | 184
-rw-r--r--  arch/x86/mm/srat_32.c                                                 | 285
-rw-r--r--  arch/x86/mm/srat_64.c                                                 | 564
-rw-r--r--  arch/x86/mm/tlb.c                                                     |  63
29 files changed, 2562 insertions(+), 2621 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a4c768397baa..3d11327c9ab4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,10 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 
 obj-$(CONFIG_NUMA)		+= numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA)		+= k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA)		+= srat_$(BITS).o
+obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
+obj-$(CONFIG_ACPI_NUMA)		+= srat.o
+obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
+
+obj-$(CONFIG_HAVE_MEMBLOCK)	+= memblock.o
 
 obj-$(CONFIG_MEMTEST)		+= memtest.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology.c
index 970ed579d4e4..5247d01329ca 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,8 +1,8 @@
 /*
- * AMD K8 NUMA support.
+ * AMD NUMA support.
  * Discover the memory map and associated nodes.
  *
- * This version reads it directly from the K8 northbridge.
+ * This version reads it directly from the AMD northbridge.
  *
  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
  */
@@ -11,6 +11,9 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/nodemask.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
+
 #include <asm/io.h>
 #include <linux/pci_ids.h>
 #include <linux/acpi.h>
@@ -22,10 +25,9 @@
 #include <asm/numa.h>
 #include <asm/mpspec.h>
 #include <asm/apic.h>
-#include <asm/k8.h>
+#include <asm/amd_nb.h>
 
-static struct bootnode __initdata nodes[8];
-static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
+static unsigned char __initdata nodeids[8];
 
 static __init int find_northbridge(void)
 {
@@ -48,14 +50,14 @@ static __init int find_northbridge(void)
 		return num;
 	}
 
-	return -1;
+	return -ENOENT;
 }
 
 static __init void early_get_boot_cpu_id(void)
 {
 	/*
-	 * need to get boot_cpu_id so can use that to create apicid_to_node
-	 * in k8_scan_nodes()
+	 * need to get the APIC ID of the BSP so can use that to
+	 * create apicid_to_node in amd_scan_nodes()
 	 */
 #ifdef CONFIG_X86_MPPARSE
 	/*
@@ -64,33 +66,20 @@ static __init void early_get_boot_cpu_id(void)
 	if (smp_found_config)
 		early_get_smp_config();
 #endif
-	early_init_lapic_mapping();
-}
-
-int __init k8_get_nodes(struct bootnode *physnodes)
-{
-	int i;
-	int ret = 0;
-
-	for_each_node_mask(i, nodes_parsed) {
-		physnodes[ret].start = nodes[i].start;
-		physnodes[ret].end = nodes[i].end;
-		ret++;
-	}
-	return ret;
 }
 
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(void)
 {
-	unsigned long start = PFN_PHYS(start_pfn);
-	unsigned long end = PFN_PHYS(end_pfn);
+	u64 start = PFN_PHYS(0);
+	u64 end = PFN_PHYS(max_pfn);
 	unsigned numnodes;
-	unsigned long prevbase;
-	int i, nb, found = 0;
+	u64 prevbase;
+	int i, j, nb;
 	u32 nodeid, reg;
+	unsigned int bits, cores, apicid_base;
 
 	if (!early_pci_allowed())
-		return -1;
+		return -EINVAL;
 
 	nb = find_northbridge();
 	if (nb < 0)
@@ -101,40 +90,40 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 	reg = read_pci_config(0, nb, 0, 0x60);
 	numnodes = ((reg >> 4) & 0xF) + 1;
 	if (numnodes <= 1)
-		return -1;
+		return -ENOENT;
 
 	pr_info("Number of physical nodes %d\n", numnodes);
 
 	prevbase = 0;
 	for (i = 0; i < 8; i++) {
-		unsigned long base, limit;
+		u64 base, limit;
 
 		base = read_pci_config(0, nb, 1, 0x40 + i*8);
 		limit = read_pci_config(0, nb, 1, 0x44 + i*8);
 
-		nodeid = limit & 7;
+		nodeids[i] = nodeid = limit & 7;
 		if ((base & 3) == 0) {
 			if (i < numnodes)
 				pr_info("Skipping disabled node %d\n", i);
 			continue;
 		}
 		if (nodeid >= numnodes) {
-			pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
+			pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
 				base, limit);
 			continue;
 		}
 
 		if (!limit) {
-			pr_info("Skipping node entry %d (base %lx)\n",
+			pr_info("Skipping node entry %d (base %Lx)\n",
 				i, base);
 			continue;
 		}
 		if ((base >> 8) & 3 || (limit >> 8) & 3) {
-			pr_err("Node %d using interleaving mode %lx/%lx\n",
+			pr_err("Node %d using interleaving mode %Lx/%Lx\n",
 				nodeid, (base >> 8) & 3, (limit >> 8) & 3);
-			return -1;
+			return -EINVAL;
 		}
-		if (node_isset(nodeid, nodes_parsed)) {
+		if (node_isset(nodeid, numa_nodes_parsed)) {
 			pr_info("Node %d already present, skipping\n",
 				nodeid);
 			continue;
@@ -162,74 +151,47 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
 			continue;
 		}
 		if (limit < base) {
-			pr_err("Node %d bogus settings %lx-%lx.\n",
+			pr_err("Node %d bogus settings %Lx-%Lx.\n",
 				nodeid, base, limit);
 			continue;
 		}
 
 		/* Could sort here, but pun for now. Should not happen anyroads. */
 		if (prevbase > base) {
-			pr_err("Node map not sorted %lx,%lx\n",
+			pr_err("Node map not sorted %Lx,%Lx\n",
 				prevbase, base);
-			return -1;
+			return -EINVAL;
 		}
 
-		pr_info("Node %d MemBase %016lx Limit %016lx\n",
+		pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
 			nodeid, base, limit);
 
-		found++;
-
-		nodes[nodeid].start = base;
-		nodes[nodeid].end = limit;
-
 		prevbase = base;
-
-		node_set(nodeid, nodes_parsed);
+		numa_add_memblk(nodeid, base, limit);
+		node_set(nodeid, numa_nodes_parsed);
 	}
 
-	if (!found)
-		return -1;
-	return 0;
-}
+	if (!nodes_weight(numa_nodes_parsed))
+		return -ENOENT;
 
-int __init k8_scan_nodes(void)
-{
-	unsigned int bits;
-	unsigned int cores;
-	unsigned int apicid_base;
-	int i;
-
-	BUG_ON(nodes_empty(nodes_parsed));
-	node_possible_map = nodes_parsed;
-	memnode_shift = compute_hash_shift(nodes, 8, NULL);
-	if (memnode_shift < 0) {
-		pr_err("No NUMA node hash function found. Contact maintainer\n");
-		return -1;
-	}
-	pr_info("Using node hash shift of %d\n", memnode_shift);
-
-	/* use the coreid bits from early_identify_cpu */
+	/*
+	 * We seem to have valid NUMA configuration. Map apicids to nodes
+	 * using the coreid bits from early_identify_cpu.
	 */
 	bits = boot_cpu_data.x86_coreid_bits;
-	cores = (1<<bits);
+	cores = 1 << bits;
 	apicid_base = 0;
-	/* need to get boot_cpu_id early for system with apicid lifting */
+
+	/* get the APIC ID of the BSP early for systems with apicid lifting */
 	early_get_boot_cpu_id();
 	if (boot_cpu_physical_apicid > 0) {
 		pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
 		apicid_base = boot_cpu_physical_apicid;
 	}
 
-	for_each_node_mask(i, node_possible_map) {
-		int j;
-
-		e820_register_active_regions(i,
-					nodes[i].start >> PAGE_SHIFT,
-					nodes[i].end >> PAGE_SHIFT);
+	for_each_node_mask(i, numa_nodes_parsed)
 		for (j = apicid_base; j < cores + apicid_base; j++)
-			apicid_to_node[(i << bits) + j] = i;
-		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
-	}
+			set_apicid_to_node((i << bits) + j, i);
 
-	numa_init_array();
 	return 0;
 }
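
The rewritten probe keeps the register layout unchanged: the node count comes from a field starting at bit 4 of function-0 register 0x60, each node has a base/limit register pair at 0x40/0x44 + i*8 in function 1, the node ID sits in the low three bits of the limit register, and the enable bits checked above are the low two bits of the base register; what changes is only where the results go (numa_add_memblk() and numa_nodes_parsed instead of the file-local nodes[] array). A minimal user-space sketch of that decode loop, with a hypothetical read_nb_reg() standing in for read_pci_config() and made-up sample register values:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for read_pci_config(0, nb, fn, reg); fakes two
 * enabled nodes so the loop below has something to print. */
static uint32_t read_nb_reg(int fn, int reg)
{
	if (fn == 0 && reg == 0x60)
		return 1 << 4;				/* node count field = 1 -> 2 nodes */
	if (fn == 1 && reg == 0x40) return 0x00000003;	/* node 0 base, enable bits set */
	if (fn == 1 && reg == 0x44) return 0x00400000;	/* node 0 limit, node id 0 */
	if (fn == 1 && reg == 0x48) return 0x00400003;	/* node 1 base, enable bits set */
	if (fn == 1 && reg == 0x4c) return 0x00800001;	/* node 1 limit, node id 1 */
	return 0;
}

int main(void)
{
	unsigned numnodes = ((read_nb_reg(0, 0x60) >> 4) & 0xF) + 1;

	for (int i = 0; i < 8; i++) {
		uint32_t base  = read_nb_reg(1, 0x40 + i * 8);
		uint32_t limit = read_nb_reg(1, 0x44 + i * 8);
		unsigned nodeid = limit & 7;

		if ((base & 3) == 0)		/* enable bits clear: node disabled */
			continue;
		if (nodeid >= numnodes)		/* excess entry */
			continue;
		/* the kernel widens base/limit into physical addresses before
		 * handing them to numa_add_memblk(nodeid, base, limit) */
		printf("node %u: base reg %#x, limit reg %#x\n", nodeid, base, limit);
	}
	return 0;
}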
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..2dbf6bf4c7e5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,8 @@
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */
15#include <linux/prefetch.h> /* prefetchw */
14 16
15#include <asm/traps.h> /* dotraplinkage, ... */ 17#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 18#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -160,15 +162,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
160 162
161static void 163static void
162force_sig_info_fault(int si_signo, int si_code, unsigned long address, 164force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163 struct task_struct *tsk) 165 struct task_struct *tsk, int fault)
164{ 166{
167 unsigned lsb = 0;
165 siginfo_t info; 168 siginfo_t info;
166 169
167 info.si_signo = si_signo; 170 info.si_signo = si_signo;
168 info.si_errno = 0; 171 info.si_errno = 0;
169 info.si_code = si_code; 172 info.si_code = si_code;
170 info.si_addr = (void __user *)address; 173 info.si_addr = (void __user *)address;
171 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; 174 if (fault & VM_FAULT_HWPOISON_LARGE)
175 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
176 if (fault & VM_FAULT_HWPOISON)
177 lsb = PAGE_SHIFT;
178 info.si_addr_lsb = lsb;
172 179
173 force_sig_info(si_signo, &info, tsk); 180 force_sig_info(si_signo, &info, tsk);
174} 181}
@@ -223,16 +230,24 @@ void vmalloc_sync_all(void)
223 for (address = VMALLOC_START & PMD_MASK; 230 for (address = VMALLOC_START & PMD_MASK;
224 address >= TASK_SIZE && address < FIXADDR_TOP; 231 address >= TASK_SIZE && address < FIXADDR_TOP;
225 address += PMD_SIZE) { 232 address += PMD_SIZE) {
226
227 unsigned long flags;
228 struct page *page; 233 struct page *page;
229 234
230 spin_lock_irqsave(&pgd_lock, flags); 235 spin_lock(&pgd_lock);
231 list_for_each_entry(page, &pgd_list, lru) { 236 list_for_each_entry(page, &pgd_list, lru) {
232 if (!vmalloc_sync_one(page_address(page), address)) 237 spinlock_t *pgt_lock;
238 pmd_t *ret;
239
240 /* the pgt_lock only for Xen */
241 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
242
243 spin_lock(pgt_lock);
244 ret = vmalloc_sync_one(page_address(page), address);
245 spin_unlock(pgt_lock);
246
247 if (!ret)
233 break; 248 break;
234 } 249 }
235 spin_unlock_irqrestore(&pgd_lock, flags); 250 spin_unlock(&pgd_lock);
236 } 251 }
237} 252}
238 253
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
251 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 266 if (!(address >= VMALLOC_START && address < VMALLOC_END))
252 return -1; 267 return -1;
253 268
269 WARN_ON_ONCE(in_nmi());
270
254 /* 271 /*
255 * Synchronize this task's top level page-table 272 * Synchronize this task's top level page-table
256 * with the 'reference' page table. 273 * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
326 343
327void vmalloc_sync_all(void) 344void vmalloc_sync_all(void)
328{ 345{
329 unsigned long address; 346 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
330
331 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
332 address += PGDIR_SIZE) {
333
334 const pgd_t *pgd_ref = pgd_offset_k(address);
335 unsigned long flags;
336 struct page *page;
337
338 if (pgd_none(*pgd_ref))
339 continue;
340
341 spin_lock_irqsave(&pgd_lock, flags);
342 list_for_each_entry(page, &pgd_list, lru) {
343 pgd_t *pgd;
344 pgd = (pgd_t *)page_address(page) + pgd_index(address);
345 if (pgd_none(*pgd))
346 set_pgd(pgd, *pgd_ref);
347 else
348 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
349 }
350 spin_unlock_irqrestore(&pgd_lock, flags);
351 }
352} 347}
353 348
354/* 349/*
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
369 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 364 if (!(address >= VMALLOC_START && address < VMALLOC_END))
370 return -1; 365 return -1;
371 366
367 WARN_ON_ONCE(in_nmi());
368
372 /* 369 /*
373 * Copy kernel mappings over when needed. This can also 370 * Copy kernel mappings over when needed. This can also
374 * happen within a race in page table update. In the later 371 * happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
731 tsk->thread.error_code = error_code | (address >= TASK_SIZE); 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
732 tsk->thread.trap_no = 14; 729 tsk->thread.trap_no = 14;
733 730
734 force_sig_info_fault(SIGSEGV, si_code, address, tsk); 731 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
735 732
736 return; 733 return;
737 } 734 }
@@ -816,28 +813,51 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
816 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
817 814
818#ifdef CONFIG_MEMORY_FAILURE 815#ifdef CONFIG_MEMORY_FAILURE
819 if (fault & VM_FAULT_HWPOISON) { 816 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
820 printk(KERN_ERR 817 printk(KERN_ERR
821 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
822 tsk->comm, tsk->pid, address); 819 tsk->comm, tsk->pid, address);
823 code = BUS_MCEERR_AR; 820 code = BUS_MCEERR_AR;
824 } 821 }
825#endif 822#endif
826 force_sig_info_fault(SIGBUS, code, address, tsk); 823 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
827} 824}
828 825
829static noinline void 826static noinline int
830mm_fault_error(struct pt_regs *regs, unsigned long error_code, 827mm_fault_error(struct pt_regs *regs, unsigned long error_code,
831 unsigned long address, unsigned int fault) 828 unsigned long address, unsigned int fault)
832{ 829{
830 /*
831 * Pagefault was interrupted by SIGKILL. We have no reason to
832 * continue pagefault.
833 */
834 if (fatal_signal_pending(current)) {
835 if (!(fault & VM_FAULT_RETRY))
836 up_read(&current->mm->mmap_sem);
837 if (!(error_code & PF_USER))
838 no_context(regs, error_code, address);
839 return 1;
840 }
841 if (!(fault & VM_FAULT_ERROR))
842 return 0;
843
833 if (fault & VM_FAULT_OOM) { 844 if (fault & VM_FAULT_OOM) {
845 /* Kernel mode? Handle exceptions or die: */
846 if (!(error_code & PF_USER)) {
847 up_read(&current->mm->mmap_sem);
848 no_context(regs, error_code, address);
849 return 1;
850 }
851
834 out_of_memory(regs, error_code, address); 852 out_of_memory(regs, error_code, address);
835 } else { 853 } else {
836 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) 854 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
855 VM_FAULT_HWPOISON_LARGE))
837 do_sigbus(regs, error_code, address, fault); 856 do_sigbus(regs, error_code, address, fault);
838 else 857 else
839 BUG(); 858 BUG();
840 } 859 }
860 return 1;
841} 861}
842 862
843static int spurious_fault_check(unsigned long error_code, pte_t *pte) 863static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -894,8 +914,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
894 if (pmd_large(*pmd)) 914 if (pmd_large(*pmd))
895 return spurious_fault_check(error_code, (pte_t *) pmd); 915 return spurious_fault_check(error_code, (pte_t *) pmd);
896 916
917 /*
918 * Note: don't use pte_present() here, since it returns true
919 * if the _PAGE_PROTNONE bit is set. However, this aliases the
920 * _PAGE_GLOBAL bit, which for kernel pages give false positives
921 * when CONFIG_DEBUG_PAGEALLOC is used.
922 */
897 pte = pte_offset_kernel(pmd, address); 923 pte = pte_offset_kernel(pmd, address);
898 if (!pte_present(*pte)) 924 if (!(pte_flags(*pte) & _PAGE_PRESENT))
899 return 0; 925 return 0;
900 926
901 ret = spurious_fault_check(error_code, pte); 927 ret = spurious_fault_check(error_code, pte);
@@ -915,9 +941,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
915int show_unhandled_signals = 1; 941int show_unhandled_signals = 1;
916 942
917static inline int 943static inline int
918access_error(unsigned long error_code, int write, struct vm_area_struct *vma) 944access_error(unsigned long error_code, struct vm_area_struct *vma)
919{ 945{
920 if (write) { 946 if (error_code & PF_WRITE) {
921 /* write, present and write, not present: */ 947 /* write, present and write, not present: */
922 if (unlikely(!(vma->vm_flags & VM_WRITE))) 948 if (unlikely(!(vma->vm_flags & VM_WRITE)))
923 return 1; 949 return 1;
@@ -952,8 +978,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
952 struct task_struct *tsk; 978 struct task_struct *tsk;
953 unsigned long address; 979 unsigned long address;
954 struct mm_struct *mm; 980 struct mm_struct *mm;
955 int write;
956 int fault; 981 int fault;
982 int write = error_code & PF_WRITE;
983 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
984 (write ? FAULT_FLAG_WRITE : 0);
957 985
958 tsk = current; 986 tsk = current;
959 mm = tsk->mm; 987 mm = tsk->mm;
@@ -1064,6 +1092,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1064 bad_area_nosemaphore(regs, error_code, address); 1092 bad_area_nosemaphore(regs, error_code, address);
1065 return; 1093 return;
1066 } 1094 }
1095retry:
1067 down_read(&mm->mmap_sem); 1096 down_read(&mm->mmap_sem);
1068 } else { 1097 } else {
1069 /* 1098 /*
@@ -1107,9 +1136,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1107 * we can handle it.. 1136 * we can handle it..
1108 */ 1137 */
1109good_area: 1138good_area:
1110 write = error_code & PF_WRITE; 1139 if (unlikely(access_error(error_code, vma))) {
1111
1112 if (unlikely(access_error(error_code, write, vma))) {
1113 bad_area_access_error(regs, error_code, address); 1140 bad_area_access_error(regs, error_code, address);
1114 return; 1141 return;
1115 } 1142 }
@@ -1119,21 +1146,34 @@ good_area:
1119 * make sure we exit gracefully rather than endlessly redo 1146 * make sure we exit gracefully rather than endlessly redo
1120 * the fault: 1147 * the fault:
1121 */ 1148 */
1122 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); 1149 fault = handle_mm_fault(mm, vma, address, flags);
1123 1150
1124 if (unlikely(fault & VM_FAULT_ERROR)) { 1151 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
1125 mm_fault_error(regs, error_code, address, fault); 1152 if (mm_fault_error(regs, error_code, address, fault))
1126 return; 1153 return;
1127 } 1154 }
1128 1155
1129 if (fault & VM_FAULT_MAJOR) { 1156 /*
1130 tsk->maj_flt++; 1157 * Major/minor page fault accounting is only done on the
1131 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1158 * initial attempt. If we go through a retry, it is extremely
1132 regs, address); 1159 * likely that the page will be found in page cache at that point.
1133 } else { 1160 */
1134 tsk->min_flt++; 1161 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1135 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1162 if (fault & VM_FAULT_MAJOR) {
1136 regs, address); 1163 tsk->maj_flt++;
1164 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1165 regs, address);
1166 } else {
1167 tsk->min_flt++;
1168 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1169 regs, address);
1170 }
1171 if (fault & VM_FAULT_RETRY) {
1172 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1173 * of starvation. */
1174 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1175 goto retry;
1176 }
1137 } 1177 }
1138 1178
1139 check_v8086_mode(regs, address, tsk); 1179 check_v8086_mode(regs, address, tsk);
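
The do_page_fault() hunks above add the VM_FAULT_RETRY handling: handle_mm_fault() is now called with FAULT_FLAG_ALLOW_RETRY, and if it asks for a retry (having already dropped mmap_sem) the handler clears the flag and jumps back to retake the lock, so at most one retry happens and major/minor accounting is done only on the initial attempt. A minimal sketch of that control flow, using stand-in names (fault_once(), RES_*, FLAG_ALLOW_RETRY) rather than the kernel API:

#include <stdio.h>

enum fault_res { RES_OK, RES_RETRY, RES_ERROR };
#define FLAG_ALLOW_RETRY 0x1

/* Stand-in for handle_mm_fault(): pretends the first attempt had to wait. */
static enum fault_res fault_once(unsigned long addr, unsigned flags)
{
	static int calls;
	(void)addr;
	return (calls++ == 0 && (flags & FLAG_ALLOW_RETRY)) ? RES_RETRY : RES_OK;
}

static int handle_fault(unsigned long addr)
{
	unsigned flags = FLAG_ALLOW_RETRY;
	enum fault_res res;

retry:
	res = fault_once(addr, flags);		/* may drop the mmap lock */
	if (res == RES_ERROR)
		return -1;

	if (flags & FLAG_ALLOW_RETRY) {
		/* first attempt only: this is where maj_flt/min_flt and the
		 * perf software events get accounted */
		if (res == RES_RETRY) {
			flags &= ~FLAG_ALLOW_RETRY;	/* no second retry: avoids starvation */
			goto retry;
		}
	}
	return 0;
}

int main(void)
{
	printf("fault handled: %d\n", handle_fault(0x1000));
	return 0;
}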
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't because it has irq disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
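
The new pmd_trans_splitting() test is explained by the in-line comment: the fast path runs with interrupts disabled, so it cannot wait out a concurrent huge-page split (the splitter's TLB-flush IPI would never be serviced); returning 0 simply makes the caller fall back to the sleeping slow path. That "bail out and let the slow path finish" shape, reduced to a user-space sketch with stand-in names (fast_walk(), slow_walk()):

#include <stdio.h>

#define NPAGES 8

/* Stand-in for the irq-off fast walk: gives up at the first page it cannot
 * handle without sleeping and reports how many it did pin. */
static int fast_walk(int nr, int *pages)
{
	int i;
	for (i = 0; i < nr; i++) {
		if (i == 3)		/* e.g. a pmd currently being split */
			break;
		pages[i] = 1;
	}
	return i;
}

/* Stand-in for the sleeping slow path (get_user_pages). */
static int slow_walk(int nr, int *pages)
{
	for (int i = 0; i < nr; i++)
		pages[i] = 1;
	return nr;
}

int main(void)
{
	int pages[NPAGES] = { 0 };
	int got = fast_walk(NPAGES, pages);

	if (got < NPAGES)	/* fast path bailed: finish with the slow path */
		got += slow_walk(NPAGES - got, pages + got);
	printf("pinned %d pages\n", got);
	return 0;
}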
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 5e8fa12ef861..b49962662101 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -9,6 +9,7 @@ void *kmap(struct page *page)
9 return page_address(page); 9 return page_address(page);
10 return kmap_high(page); 10 return kmap_high(page);
11} 11}
12EXPORT_SYMBOL(kmap);
12 13
13void kunmap(struct page *page) 14void kunmap(struct page *page)
14{ 15{
@@ -18,6 +19,7 @@ void kunmap(struct page *page)
18 return; 19 return;
19 kunmap_high(page); 20 kunmap_high(page);
20} 21}
22EXPORT_SYMBOL(kunmap);
21 23
22/* 24/*
23 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because 25 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
@@ -27,10 +29,10 @@ void kunmap(struct page *page)
27 * However when holding an atomic kmap it is not legal to sleep, so atomic 29 * However when holding an atomic kmap it is not legal to sleep, so atomic
28 * kmaps are appropriate for short, tight code paths only. 30 * kmaps are appropriate for short, tight code paths only.
29 */ 31 */
30void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) 32void *kmap_atomic_prot(struct page *page, pgprot_t prot)
31{ 33{
32 enum fixed_addresses idx;
33 unsigned long vaddr; 34 unsigned long vaddr;
35 int idx, type;
34 36
35 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 37 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
36 pagefault_disable(); 38 pagefault_disable();
@@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
38 if (!PageHighMem(page)) 40 if (!PageHighMem(page))
39 return page_address(page); 41 return page_address(page);
40 42
41 debug_kmap_atomic(type); 43 type = kmap_atomic_idx_push();
42
43 idx = type + KM_TYPE_NR*smp_processor_id(); 44 idx = type + KM_TYPE_NR*smp_processor_id();
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 45 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 46 BUG_ON(!pte_none(*(kmap_pte-idx)));
@@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
47 48
48 return (void *)vaddr; 49 return (void *)vaddr;
49} 50}
51EXPORT_SYMBOL(kmap_atomic_prot);
52
53void *__kmap_atomic(struct page *page)
54{
55 return kmap_atomic_prot(page, kmap_prot);
56}
57EXPORT_SYMBOL(__kmap_atomic);
50 58
51void *kmap_atomic(struct page *page, enum km_type type) 59/*
60 * This is the same as kmap_atomic() but can map memory that doesn't
61 * have a struct page associated with it.
62 */
63void *kmap_atomic_pfn(unsigned long pfn)
52{ 64{
53 return kmap_atomic_prot(page, type, kmap_prot); 65 return kmap_atomic_prot_pfn(pfn, kmap_prot);
54} 66}
67EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
55 68
56void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) 69void __kunmap_atomic(void *kvaddr)
57{ 70{
58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 71 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 72
60 73 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
61 /* 74 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
62 * Force other mappings to Oops if they'll try to access this pte 75 int idx, type;
63 * without first remap it. Keeping stale mappings around is a bad idea 76
64 * also, in case the page changes cacheability attributes or becomes 77 type = kmap_atomic_idx();
65 * a protected page in a hypervisor. 78 idx = type + KM_TYPE_NR * smp_processor_id();
66 */ 79
67 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 80#ifdef CONFIG_DEBUG_HIGHMEM
81 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
82#endif
83 /*
84 * Force other mappings to Oops if they'll try to access this
85 * pte without first remap it. Keeping stale mappings around
86 * is a bad idea also, in case the page changes cacheability
87 * attributes or becomes a protected page in a hypervisor.
88 */
68 kpte_clear_flush(kmap_pte-idx, vaddr); 89 kpte_clear_flush(kmap_pte-idx, vaddr);
69 else { 90 kmap_atomic_idx_pop();
91 }
70#ifdef CONFIG_DEBUG_HIGHMEM 92#ifdef CONFIG_DEBUG_HIGHMEM
93 else {
71 BUG_ON(vaddr < PAGE_OFFSET); 94 BUG_ON(vaddr < PAGE_OFFSET);
72 BUG_ON(vaddr >= (unsigned long)high_memory); 95 BUG_ON(vaddr >= (unsigned long)high_memory);
73#endif
74 } 96 }
97#endif
75 98
76 pagefault_enable(); 99 pagefault_enable();
77} 100}
78 101EXPORT_SYMBOL(__kunmap_atomic);
79/*
80 * This is the same as kmap_atomic() but can map memory that doesn't
81 * have a struct page associated with it.
82 */
83void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
84{
85 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
86}
87EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
88 102
89struct page *kmap_atomic_to_page(void *ptr) 103struct page *kmap_atomic_to_page(void *ptr)
90{ 104{
@@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr)
98 pte = kmap_pte - (idx - FIX_KMAP_BEGIN); 112 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
99 return pte_page(*pte); 113 return pte_page(*pte);
100} 114}
101
102EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic_notypecheck);
106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page); 115EXPORT_SYMBOL(kmap_atomic_to_page);
108 116
109void __init set_highmem_pages_init(void) 117void __init set_highmem_pages_init(void)
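
The highmem_32.c rework drops the caller-supplied enum km_type: kmap_atomic_prot() now takes a slot from a small per-CPU stack via kmap_atomic_idx_push(), and __kunmap_atomic() pops it again, so nested atomic kmaps work as long as they are undone in LIFO order. A sketch of that slot-stack idea, single-threaded and with illustrative names rather than the kernel helpers:

#include <assert.h>
#include <stdio.h>

#define KM_SLOTS 16			/* depth of the fixmap window per CPU */

static int km_depth;			/* per-CPU in the kernel; global here */

static int kmap_idx_push(void)		/* like kmap_atomic_idx_push() */
{
	assert(km_depth < KM_SLOTS);
	return km_depth++;		/* index of the fixmap pte slot to use */
}

static int kmap_idx_top(void)		/* like kmap_atomic_idx() */
{
	assert(km_depth > 0);
	return km_depth - 1;
}

static void kmap_idx_pop(void)		/* like kmap_atomic_idx_pop() */
{
	assert(km_depth > 0);
	km_depth--;
}

int main(void)
{
	int a = kmap_idx_push();	/* outer atomic kmap */
	int b = kmap_idx_push();	/* nested atomic kmap */

	printf("outer slot %d, nested slot %d\n", a, b);

	/* unmap must be LIFO: the nested mapping goes first */
	assert(kmap_idx_top() == b);
	kmap_idx_pop();
	assert(kmap_idx_top() == a);
	kmap_idx_pop();
	return 0;
}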
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..f581a18c0d4d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 	if (!vma_shareable(vma, addr))
 		return;
 
-	spin_lock(&mapping->i_mmap_lock);
+	mutex_lock(&mapping->i_mmap_mutex);
 	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		put_page(virt_to_page(spte));
 	spin_unlock(&mm->page_table_lock);
 out:
-	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->i_mmap_mutex);
 }
 
 /*
@@ -326,7 +326,7 @@ try_again:
 	if (mm->free_area_cache < len)
 		goto fail;
 
-	/* either no address requested or cant fit in requested address hole */
+	/* either no address requested or can't fit in requested address hole */
 	addr = (mm->free_area_cache - len) & huge_page_mask(h);
 	do {
 		/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b278535b14aa..30326443ab81 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -2,6 +2,7 @@
2#include <linux/initrd.h> 2#include <linux/initrd.h>
3#include <linux/ioport.h> 3#include <linux/ioport.h>
4#include <linux/swap.h> 4#include <linux/swap.h>
5#include <linux/memblock.h>
5 6
6#include <asm/cacheflush.h> 7#include <asm/cacheflush.h>
7#include <asm/e820.h> 8#include <asm/e820.h>
@@ -15,11 +16,9 @@
15#include <asm/tlb.h> 16#include <asm/tlb.h>
16#include <asm/proto.h> 17#include <asm/proto.h>
17 18
18DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19unsigned long __initdata pgt_buf_start;
19 20unsigned long __meminitdata pgt_buf_end;
20unsigned long __initdata e820_table_start; 21unsigned long __meminitdata pgt_buf_top;
21unsigned long __meminitdata e820_table_end;
22unsigned long __meminitdata e820_table_top;
23 22
24int after_bootmem; 23int after_bootmem;
25 24
@@ -32,7 +31,8 @@ int direct_gbpages
32static void __init find_early_table_space(unsigned long end, int use_pse, 31static void __init find_early_table_space(unsigned long end, int use_pse,
33 int use_gbpages) 32 int use_gbpages)
34{ 33{
35 unsigned long puds, pmds, ptes, tables, start; 34 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
35 phys_addr_t base;
36 36
37 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 37 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
38 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 38 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
@@ -63,29 +63,25 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
63#ifdef CONFIG_X86_32 63#ifdef CONFIG_X86_32
64 /* for fixmap */ 64 /* for fixmap */
65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
66#endif
67 66
68 /* 67 good_end = max_pfn_mapped << PAGE_SHIFT;
69 * RED-PEN putting page tables only on node 0 could
70 * cause a hotspot and fill up ZONE_DMA. The page tables
71 * need roughly 0.5KB per GB.
72 */
73#ifdef CONFIG_X86_32
74 start = 0x7000;
75#else
76 start = 0x8000;
77#endif 68#endif
78 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 69
79 tables, PAGE_SIZE); 70 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
80 if (e820_table_start == -1UL) 71 if (base == MEMBLOCK_ERROR)
81 panic("Cannot find space for the kernel page tables"); 72 panic("Cannot find space for the kernel page tables");
82 73
83 e820_table_start >>= PAGE_SHIFT; 74 pgt_buf_start = base >> PAGE_SHIFT;
84 e820_table_end = e820_table_start; 75 pgt_buf_end = pgt_buf_start;
85 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 76 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
86 77
87 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 78 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
88 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 79 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
80}
81
82void __init native_pagetable_reserve(u64 start, u64 end)
83{
84 memblock_x86_reserve_range(start, end, "PGTABLE");
89} 85}
90 86
91struct map_range { 87struct map_range {
@@ -277,30 +273,26 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
277 load_cr3(swapper_pg_dir); 273 load_cr3(swapper_pg_dir);
278#endif 274#endif
279 275
280#ifdef CONFIG_X86_64
281 if (!after_bootmem && !start) {
282 pud_t *pud;
283 pmd_t *pmd;
284
285 mmu_cr4_features = read_cr4();
286
287 /*
288 * _brk_end cannot change anymore, but it and _end may be
289 * located on different 2M pages. cleanup_highmap(), however,
290 * can only consider _end when it runs, so destroy any
291 * mappings beyond _brk_end here.
292 */
293 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
294 pmd = pmd_offset(pud, _brk_end - 1);
295 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
296 pmd_clear(pmd);
297 }
298#endif
299 __flush_tlb_all(); 276 __flush_tlb_all();
300 277
301 if (!after_bootmem && e820_table_end > e820_table_start) 278 /*
302 reserve_early(e820_table_start << PAGE_SHIFT, 279 * Reserve the kernel pagetable pages we used (pgt_buf_start -
303 e820_table_end << PAGE_SHIFT, "PGTABLE"); 280 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
281 * so that they can be reused for other purposes.
282 *
283 * On native it just means calling memblock_x86_reserve_range, on Xen it
284 * also means marking RW the pagetable pages that we allocated before
285 * but that haven't been used.
286 *
287 * In fact on xen we mark RO the whole range pgt_buf_start -
288 * pgt_buf_top, because we have to make sure that when
289 * init_memory_mapping reaches the pagetable pages area, it maps
290 * RO all the pagetable pages, including the ones that are beyond
291 * pgt_buf_end at that time.
292 */
293 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
294 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
295 PFN_PHYS(pgt_buf_end));
304 296
305 if (!after_bootmem) 297 if (!after_bootmem)
306 early_memtest(start, end); 298 early_memtest(start, end);
@@ -362,8 +354,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
362 /* 354 /*
363 * We just marked the kernel text read only above, now that 355 * We just marked the kernel text read only above, now that
364 * we are going to free part of that, we need to make that 356 * we are going to free part of that, we need to make that
365 * writeable first. 357 * writeable and non-executable first.
366 */ 358 */
359 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
367 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 360 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
368 361
369 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 362 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
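
With this change the scratch area for the early page tables is no longer carved out of e820 directly: find_early_table_space() asks memblock_find_in_range() for a window and records it in pgt_buf_start/pgt_buf_end/pgt_buf_top; alloc_low_page() (see the init_32.c and init_64.c hunks below) hands out pages by bumping pgt_buf_end, and only the part actually used is reserved afterwards through x86_init.mapping.pagetable_reserve. A reduced sketch of that bump-allocator pattern, with illustrative names and a static array standing in for the memblock range:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define BUF_PAGES 32UL

static unsigned char backing[BUF_PAGES * PAGE_SIZE];	/* stands in for the found range */
static unsigned long buf_start, buf_end, buf_top;	/* page indexes, like pgt_buf_* */

static void find_table_space(unsigned long pages_needed)
{
	/* the kernel gets this window from memblock_find_in_range() */
	buf_start = 0;
	buf_end   = buf_start;
	buf_top   = buf_start + pages_needed;
}

static void *alloc_low_page(void)
{
	unsigned long pfn = buf_end++;

	if (pfn >= buf_top)
		return NULL;			/* the kernel panics here */
	memset(backing + pfn * PAGE_SIZE, 0, PAGE_SIZE);	/* clear_page() */
	return backing + pfn * PAGE_SIZE;
}

int main(void)
{
	int used = 0;

	find_table_space(4);
	while (alloc_low_page() != NULL)
		used++;
	/* afterwards the kernel reserves [buf_start, buf_end) as page tables
	 * and releases [buf_end, buf_top) for reuse: pagetable_reserve() */
	printf("allocated %d page-table pages\n", used);
	return 0;
}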
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..29f7c6d98179 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,6 +25,7 @@
25#include <linux/pfn.h> 25#include <linux/pfn.h>
26#include <linux/poison.h> 26#include <linux/poison.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/memblock.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
29#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
30#include <linux/initrd.h> 31#include <linux/initrd.h>
@@ -44,6 +45,7 @@
44#include <asm/bugs.h> 45#include <asm/bugs.h>
45#include <asm/tlb.h> 46#include <asm/tlb.h>
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/olpc_ofw.h>
47#include <asm/pgalloc.h> 49#include <asm/pgalloc.h>
48#include <asm/sections.h> 50#include <asm/sections.h>
49#include <asm/paravirt.h> 51#include <asm/paravirt.h>
@@ -60,14 +62,14 @@ bool __read_mostly __vmalloc_start_set = false;
60 62
61static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
62{ 64{
63 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
64 void *adr; 66 void *adr;
65 67
66 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
67 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
68 70
69 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
70 memset(adr, 0, PAGE_SIZE); 72 clear_page(adr);
71 return adr; 73 return adr;
72} 74}
73 75
@@ -161,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
161 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
162 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
163 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
164 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
165 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
166 pte_t *newpte; 168 pte_t *newpte;
167 int i; 169 int i;
168 170
@@ -225,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
225 227
226static inline int is_kernel_text(unsigned long addr) 228static inline int is_kernel_text(unsigned long addr)
227{ 229{
228 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 230 if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
229 return 1; 231 return 1;
230 return 0; 232 return 0;
231} 233}
@@ -422,49 +424,28 @@ static void __init add_one_highpage_init(struct page *page)
422 totalhigh_pages++; 424 totalhigh_pages++;
423} 425}
424 426
425struct add_highpages_data { 427void __init add_highpages_with_active_regions(int nid,
426 unsigned long start_pfn; 428 unsigned long start_pfn, unsigned long end_pfn)
427 unsigned long end_pfn;
428};
429
430static int __init add_highpages_work_fn(unsigned long start_pfn,
431 unsigned long end_pfn, void *datax)
432{ 429{
433 int node_pfn; 430 struct range *range;
434 struct page *page; 431 int nr_range;
435 unsigned long final_start_pfn, final_end_pfn; 432 int i;
436 struct add_highpages_data *data;
437 433
438 data = (struct add_highpages_data *)datax; 434 nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
439 435
440 final_start_pfn = max(start_pfn, data->start_pfn); 436 for (i = 0; i < nr_range; i++) {
441 final_end_pfn = min(end_pfn, data->end_pfn); 437 struct page *page;
442 if (final_start_pfn >= final_end_pfn) 438 int node_pfn;
443 return 0;
444 439
445 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; 440 for (node_pfn = range[i].start; node_pfn < range[i].end;
446 node_pfn++) { 441 node_pfn++) {
447 if (!pfn_valid(node_pfn)) 442 if (!pfn_valid(node_pfn))
448 continue; 443 continue;
449 page = pfn_to_page(node_pfn); 444 page = pfn_to_page(node_pfn);
450 add_one_highpage_init(page); 445 add_one_highpage_init(page);
446 }
451 } 447 }
452
453 return 0;
454
455} 448}
456
457void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
458 unsigned long end_pfn)
459{
460 struct add_highpages_data data;
461
462 data.start_pfn = start_pfn;
463 data.end_pfn = end_pfn;
464
465 work_with_active_regions(nid, add_highpages_work_fn, &data);
466}
467
468#else 449#else
469static inline void permanent_kmaps_init(pgd_t *pgd_base) 450static inline void permanent_kmaps_init(pgd_t *pgd_base)
470{ 451{
@@ -548,48 +529,6 @@ static void __init pagetable_init(void)
548 permanent_kmaps_init(pgd_base); 529 permanent_kmaps_init(pgd_base);
549} 530}
550 531
551#ifdef CONFIG_ACPI_SLEEP
552/*
553 * ACPI suspend needs this for resume, because things like the intel-agp
554 * driver might have split up a kernel 4MB mapping.
555 */
556char swsusp_pg_dir[PAGE_SIZE]
557 __attribute__ ((aligned(PAGE_SIZE)));
558
559static inline void save_pg_dir(void)
560{
561 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
562}
563#else /* !CONFIG_ACPI_SLEEP */
564static inline void save_pg_dir(void)
565{
566}
567#endif /* !CONFIG_ACPI_SLEEP */
568
569void zap_low_mappings(bool early)
570{
571 int i;
572
573 /*
574 * Zap initial low-memory mappings.
575 *
576 * Note that "pgd_clear()" doesn't do it for
577 * us, because pgd_clear() is a no-op on i386.
578 */
579 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
580#ifdef CONFIG_X86_PAE
581 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
582#else
583 set_pgd(swapper_pg_dir+i, __pgd(0));
584#endif
585 }
586
587 if (early)
588 __flush_tlb();
589 else
590 flush_tlb_all();
591}
592
593pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 532pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
594EXPORT_SYMBOL_GPL(__supported_pte_mask); 533EXPORT_SYMBOL_GPL(__supported_pte_mask);
595 534
@@ -705,21 +644,20 @@ void __init find_low_pfn_range(void)
705} 644}
706 645
707#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
708void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
709 int acpi, int k8)
710{ 648{
711#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
712 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
713 if (max_pfn > max_low_pfn) 651 if (max_pfn > max_low_pfn)
714 highstart_pfn = max_low_pfn; 652 highstart_pfn = max_low_pfn;
715 e820_register_active_regions(0, 0, highend_pfn); 653 memblock_x86_register_active_regions(0, 0, highend_pfn);
716 sparse_memory_present_with_active_regions(0); 654 sparse_memory_present_with_active_regions(0);
717 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 655 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
718 pages_to_mb(highend_pfn - highstart_pfn)); 656 pages_to_mb(highend_pfn - highstart_pfn));
719 num_physpages = highend_pfn; 657 num_physpages = highend_pfn;
720 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 658 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
721#else 659#else
722 e820_register_active_regions(0, 0, max_low_pfn); 660 memblock_x86_register_active_regions(0, 0, max_low_pfn);
723 sparse_memory_present_with_active_regions(0); 661 sparse_memory_present_with_active_regions(0);
724 num_physpages = max_low_pfn; 662 num_physpages = max_low_pfn;
725 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 663 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
@@ -740,8 +678,10 @@ static void __init zone_sizes_init(void)
740{ 678{
741 unsigned long max_zone_pfns[MAX_NR_ZONES]; 679 unsigned long max_zone_pfns[MAX_NR_ZONES];
742 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
681#ifdef CONFIG_ZONE_DMA
743 max_zone_pfns[ZONE_DMA] = 682 max_zone_pfns[ZONE_DMA] =
744 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 683 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
684#endif
745 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 685 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
746#ifdef CONFIG_HIGHMEM 686#ifdef CONFIG_HIGHMEM
747 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 687 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -750,68 +690,12 @@ static void __init zone_sizes_init(void)
750 free_area_init_nodes(max_zone_pfns); 690 free_area_init_nodes(max_zone_pfns);
751} 691}
752 692
753#ifndef CONFIG_NO_BOOTMEM
754static unsigned long __init setup_node_bootmem(int nodeid,
755 unsigned long start_pfn,
756 unsigned long end_pfn,
757 unsigned long bootmap)
758{
759 unsigned long bootmap_size;
760
761 /* don't touch min_low_pfn */
762 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
763 bootmap >> PAGE_SHIFT,
764 start_pfn, end_pfn);
765 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
766 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
768 nodeid, bootmap, bootmap + bootmap_size);
769 free_bootmem_with_active_regions(nodeid, end_pfn);
770
771 return bootmap + bootmap_size;
772}
773#endif
774
775void __init setup_bootmem_allocator(void) 693void __init setup_bootmem_allocator(void)
776{ 694{
777#ifndef CONFIG_NO_BOOTMEM
778 int nodeid;
779 unsigned long bootmap_size, bootmap;
780 /*
781 * Initialize the boot-time allocator (with low memory only):
782 */
783 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
784 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
785 PAGE_SIZE);
786 if (bootmap == -1L)
787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
790
791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 695 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
792 max_pfn_mapped<<PAGE_SHIFT); 696 max_pfn_mapped<<PAGE_SHIFT);
793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 697 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
794 698
795#ifndef CONFIG_NO_BOOTMEM
796 for_each_online_node(nodeid) {
797 unsigned long start_pfn, end_pfn;
798
799#ifdef CONFIG_NEED_MULTIPLE_NODES
800 start_pfn = node_start_pfn[nodeid];
801 end_pfn = node_end_pfn[nodeid];
802 if (start_pfn > max_low_pfn)
803 continue;
804 if (end_pfn > max_low_pfn)
805 end_pfn = max_low_pfn;
806#else
807 start_pfn = 0;
808 end_pfn = max_low_pfn;
809#endif
810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
811 bootmap);
812 }
813#endif
814
815 after_bootmem = 1; 699 after_bootmem = 1;
816} 700}
817 701
@@ -833,6 +717,8 @@ void __init paging_init(void)
833 /* 717 /*
834 * NOTE: at this point the bootmem allocator is fully available. 718 * NOTE: at this point the bootmem allocator is fully available.
835 */ 719 */
720 olpc_dt_build_devicetree();
721 sparse_memory_present_with_active_regions(MAX_NUMNODES);
836 sparse_init(); 722 sparse_init();
837 zone_sizes_init(); 723 zone_sizes_init();
838} 724}
@@ -958,9 +844,6 @@ void __init mem_init(void)
958 844
959 if (boot_cpu_data.wp_works_ok < 0) 845 if (boot_cpu_data.wp_works_ok < 0)
960 test_wp_bit(); 846 test_wp_bit();
961
962 save_pg_dir();
963 zap_low_mappings(true);
964} 847}
965 848
966#ifdef CONFIG_MEMORY_HOTPLUG 849#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1033,6 +916,23 @@ void set_kernel_text_ro(void)
1033 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 916 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1034} 917}
1035 918
919static void mark_nxdata_nx(void)
920{
921 /*
922 * When this called, init has already been executed and released,
923 * so everything past _etext should be NX.
924 */
925 unsigned long start = PFN_ALIGN(_etext);
926 /*
927 * This comes from is_kernel_text upper limit. Also HPAGE where used:
928 */
929 unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
930
931 if (__supported_pte_mask & _PAGE_NX)
932 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
933 set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
934}
935
1036void mark_rodata_ro(void) 936void mark_rodata_ro(void)
1037{ 937{
1038 unsigned long start = PFN_ALIGN(_text); 938 unsigned long start = PFN_ALIGN(_text);
@@ -1067,11 +967,7 @@ void mark_rodata_ro(void)
1067 printk(KERN_INFO "Testing CPA: write protecting again\n"); 967 printk(KERN_INFO "Testing CPA: write protecting again\n");
1068 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 968 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1069#endif 969#endif
970 mark_nxdata_nx();
1070} 971}
1071#endif 972#endif
1072 973
1073int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1074 int flags)
1075{
1076 return reserve_bootmem(phys, len, flags);
1077}
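
Besides moving the bootmem setup over to memblock, the init_32.c changes add mark_nxdata_nx(), which clears the executable bit on everything past _etext; the end of the region is __init_end pushed up to a 2 MB boundary, matching the is_kernel_text() limit and the huge-page mappings mentioned in the comment. The arithmetic, spelled out with hypothetical addresses just to make the numbers concrete:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define HPAGE_SIZE	(2UL * 1024 * 1024)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
#define PFN_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long etext    = 0xc05f1234UL;	/* hypothetical end of kernel text */
	unsigned long init_end = 0xc07a0000UL;	/* hypothetical end of init sections */

	unsigned long start = PFN_ALIGN(etext);
	unsigned long size  = ((init_end + HPAGE_SIZE) & HPAGE_MASK) - start;

	printf("NX-protecting the kernel data: %luk from %#lx\n", size >> 10, start);
	return 0;
}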
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a20..bbaaa005bf0e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -21,12 +21,14 @@
21#include <linux/initrd.h> 21#include <linux/initrd.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/proc_fs.h> 25#include <linux/proc_fs.h>
25#include <linux/pci.h> 26#include <linux/pci.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/poison.h> 28#include <linux/poison.h>
28#include <linux/dma-mapping.h> 29#include <linux/dma-mapping.h>
29#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/memory.h>
30#include <linux/memory_hotplug.h> 32#include <linux/memory_hotplug.h>
31#include <linux/nmi.h> 33#include <linux/nmi.h>
32#include <linux/gfp.h> 34#include <linux/gfp.h>
@@ -50,9 +52,8 @@
50#include <asm/numa.h> 52#include <asm/numa.h>
51#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
52#include <asm/init.h> 54#include <asm/init.h>
53#include <linux/bootmem.h> 55#include <asm/uv/uv.h>
54 56#include <asm/setup.h>
55static unsigned long dma_reserve __initdata;
56 57
57static int __init parse_direct_gbpages_off(char *arg) 58static int __init parse_direct_gbpages_off(char *arg)
58{ 59{
@@ -98,6 +99,43 @@ static int __init nonx32_setup(char *str)
98__setup("noexec32=", nonx32_setup); 99__setup("noexec32=", nonx32_setup);
99 100
100/* 101/*
102 * When memory was added/removed make sure all the processes MM have
103 * suitable PGD entries in the local PGD level page.
104 */
105void sync_global_pgds(unsigned long start, unsigned long end)
106{
107 unsigned long address;
108
109 for (address = start; address <= end; address += PGDIR_SIZE) {
110 const pgd_t *pgd_ref = pgd_offset_k(address);
111 struct page *page;
112
113 if (pgd_none(*pgd_ref))
114 continue;
115
116 spin_lock(&pgd_lock);
117 list_for_each_entry(page, &pgd_list, lru) {
118 pgd_t *pgd;
119 spinlock_t *pgt_lock;
120
121 pgd = (pgd_t *)page_address(page) + pgd_index(address);
122 /* the pgt_lock only for Xen */
123 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
124 spin_lock(pgt_lock);
125
126 if (pgd_none(*pgd))
127 set_pgd(pgd, *pgd_ref);
128 else
129 BUG_ON(pgd_page_vaddr(*pgd)
130 != pgd_page_vaddr(*pgd_ref));
131
132 spin_unlock(pgt_lock);
133 }
134 spin_unlock(&pgd_lock);
135 }
136}
137
138/*
101 * NOTE: This function is marked __ref because it calls __init function 139 * NOTE: This function is marked __ref because it calls __init function
102 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 140 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103 */ 141 */
@@ -258,18 +296,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
258 * to the compile time generated pmds. This results in invalid pmds up 296 * to the compile time generated pmds. This results in invalid pmds up
259 * to the point where we hit the physaddr 0 mapping. 297 * to the point where we hit the physaddr 0 mapping.
260 * 298 *
261 * We limit the mappings to the region from _text to _end. _end is 299 * We limit the mappings to the region from _text to _brk_end. _brk_end
262 * rounded up to the 2MB boundary. This catches the invalid pmds as 300 * is rounded up to the 2MB boundary. This catches the invalid pmds as
263 * well, as they are located before _text: 301 * well, as they are located before _text:
264 */ 302 */
265void __init cleanup_highmap(void) 303void __init cleanup_highmap(void)
266{ 304{
267 unsigned long vaddr = __START_KERNEL_map; 305 unsigned long vaddr = __START_KERNEL_map;
268 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; 306 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
307 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
269 pmd_t *pmd = level2_kernel_pgt; 308 pmd_t *pmd = level2_kernel_pgt;
270 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
271 309
272 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 310 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
273 if (pmd_none(*pmd)) 311 if (pmd_none(*pmd))
274 continue; 312 continue;
275 if (vaddr < (unsigned long) _text || vaddr > end) 313 if (vaddr < (unsigned long) _text || vaddr > end)
@@ -279,7 +317,7 @@ void __init cleanup_highmap(void)
279 317
280static __ref void *alloc_low_page(unsigned long *phys) 318static __ref void *alloc_low_page(unsigned long *phys)
281{ 319{
282 unsigned long pfn = e820_table_end++; 320 unsigned long pfn = pgt_buf_end++;
283 void *adr; 321 void *adr;
284 322
285 if (after_bootmem) { 323 if (after_bootmem) {
@@ -289,21 +327,37 @@ static __ref void *alloc_low_page(unsigned long *phys)
289 return adr; 327 return adr;
290 } 328 }
291 329
292 if (pfn >= e820_table_top) 330 if (pfn >= pgt_buf_top)
293 panic("alloc_low_page: ran out of memory"); 331 panic("alloc_low_page: ran out of memory");
294 332
295 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 333 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
296 memset(adr, 0, PAGE_SIZE); 334 clear_page(adr);
297 *phys = pfn * PAGE_SIZE; 335 *phys = pfn * PAGE_SIZE;
298 return adr; 336 return adr;
299} 337}
300 338
339static __ref void *map_low_page(void *virt)
340{
341 void *adr;
342 unsigned long phys, left;
343
344 if (after_bootmem)
345 return virt;
346
347 phys = __pa(virt);
348 left = phys & (PAGE_SIZE - 1);
349 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
350 adr = (void *)(((unsigned long)adr) | left);
351
352 return adr;
353}
354
301static __ref void unmap_low_page(void *adr) 355static __ref void unmap_low_page(void *adr)
302{ 356{
303 if (after_bootmem) 357 if (after_bootmem)
304 return; 358 return;
305 359
306 early_iounmap(adr, PAGE_SIZE); 360 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
307} 361}
308 362
309static unsigned long __meminit 363static unsigned long __meminit
@@ -351,15 +405,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
351} 405}
352 406
353static unsigned long __meminit 407static unsigned long __meminit
354phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
355 pgprot_t prot)
356{
357 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
358
359 return phys_pte_init(pte, address, end, prot);
360}
361
362static unsigned long __meminit
363phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 408phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
364 unsigned long page_size_mask, pgprot_t prot) 409 unsigned long page_size_mask, pgprot_t prot)
365{ 410{
@@ -385,8 +430,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
385 if (pmd_val(*pmd)) { 430 if (pmd_val(*pmd)) {
386 if (!pmd_large(*pmd)) { 431 if (!pmd_large(*pmd)) {
387 spin_lock(&init_mm.page_table_lock); 432 spin_lock(&init_mm.page_table_lock);
388 last_map_addr = phys_pte_update(pmd, address, 433 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
434 last_map_addr = phys_pte_init(pte, address,
389 end, prot); 435 end, prot);
436 unmap_low_page(pte);
390 spin_unlock(&init_mm.page_table_lock); 437 spin_unlock(&init_mm.page_table_lock);
391 continue; 438 continue;
392 } 439 }
@@ -433,18 +480,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
433} 480}
434 481
435static unsigned long __meminit 482static unsigned long __meminit
436phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
437 unsigned long page_size_mask, pgprot_t prot)
438{
439 pmd_t *pmd = pmd_offset(pud, 0);
440 unsigned long last_map_addr;
441
442 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
443 __flush_tlb_all();
444 return last_map_addr;
445}
446
447static unsigned long __meminit
448phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
449 unsigned long page_size_mask) 484 unsigned long page_size_mask)
450{ 485{
@@ -469,8 +504,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
469 504
470 if (pud_val(*pud)) { 505 if (pud_val(*pud)) {
471 if (!pud_large(*pud)) { 506 if (!pud_large(*pud)) {
472 last_map_addr = phys_pmd_update(pud, addr, end, 507 pmd = map_low_page(pmd_offset(pud, 0));
508 last_map_addr = phys_pmd_init(pmd, addr, end,
473 page_size_mask, prot); 509 page_size_mask, prot);
510 unmap_low_page(pmd);
511 __flush_tlb_all();
474 continue; 512 continue;
475 } 513 }
476 /* 514 /*
@@ -518,27 +556,18 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
518 return last_map_addr; 556 return last_map_addr;
519} 557}
520 558
521static unsigned long __meminit
522phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
523 unsigned long page_size_mask)
524{
525 pud_t *pud;
526
527 pud = (pud_t *)pgd_page_vaddr(*pgd);
528
529 return phys_pud_init(pud, addr, end, page_size_mask);
530}
531
532unsigned long __meminit 559unsigned long __meminit
533kernel_physical_mapping_init(unsigned long start, 560kernel_physical_mapping_init(unsigned long start,
534 unsigned long end, 561 unsigned long end,
535 unsigned long page_size_mask) 562 unsigned long page_size_mask)
536{ 563{
537 564 bool pgd_changed = false;
538 unsigned long next, last_map_addr = end; 565 unsigned long next, last_map_addr = end;
566 unsigned long addr;
539 567
540 start = (unsigned long)__va(start); 568 start = (unsigned long)__va(start);
541 end = (unsigned long)__va(end); 569 end = (unsigned long)__va(end);
570 addr = start;
542 571
543 for (; start < end; start = next) { 572 for (; start < end; start = next) {
544 pgd_t *pgd = pgd_offset_k(start); 573 pgd_t *pgd = pgd_offset_k(start);
@@ -550,8 +579,10 @@ kernel_physical_mapping_init(unsigned long start,
550 next = end; 579 next = end;
551 580
552 if (pgd_val(*pgd)) { 581 if (pgd_val(*pgd)) {
553 last_map_addr = phys_pud_update(pgd, __pa(start), 582 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
583 last_map_addr = phys_pud_init(pud, __pa(start),
554 __pa(end), page_size_mask); 584 __pa(end), page_size_mask);
585 unmap_low_page(pud);
555 continue; 586 continue;
556 } 587 }
557 588
@@ -563,33 +594,21 @@ kernel_physical_mapping_init(unsigned long start,
563 spin_lock(&init_mm.page_table_lock); 594 spin_lock(&init_mm.page_table_lock);
564 pgd_populate(&init_mm, pgd, __va(pud_phys)); 595 pgd_populate(&init_mm, pgd, __va(pud_phys));
565 spin_unlock(&init_mm.page_table_lock); 596 spin_unlock(&init_mm.page_table_lock);
597 pgd_changed = true;
566 } 598 }
599
600 if (pgd_changed)
601 sync_global_pgds(addr, end);
602
567 __flush_tlb_all(); 603 __flush_tlb_all();
568 604
569 return last_map_addr; 605 return last_map_addr;
570} 606}
571 607
572#ifndef CONFIG_NUMA 608#ifndef CONFIG_NUMA
573void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 609void __init initmem_init(void)
574 int acpi, int k8) 610{
575{ 611 memblock_x86_register_active_regions(0, 0, max_pfn);
576#ifndef CONFIG_NO_BOOTMEM
577 unsigned long bootmap_size, bootmap;
578
579 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
580 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
581 PAGE_SIZE);
582 if (bootmap == -1L)
583 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
584 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
585 /* don't touch min_low_pfn */
586 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
587 0, end_pfn);
588 e820_register_active_regions(0, start_pfn, end_pfn);
589 free_bootmem_with_active_regions(0, end_pfn);
590#else
591 e820_register_active_regions(0, start_pfn, end_pfn);
592#endif
593} 612}
594#endif 613#endif
595 614
@@ -598,7 +617,9 @@ void __init paging_init(void)
598 unsigned long max_zone_pfns[MAX_NR_ZONES]; 617 unsigned long max_zone_pfns[MAX_NR_ZONES];
599 618
600 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 619 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
620#ifdef CONFIG_ZONE_DMA
601 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 621 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
622#endif
602 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 623 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
603 max_zone_pfns[ZONE_NORMAL] = max_pfn; 624 max_zone_pfns[ZONE_NORMAL] = max_pfn;
604 625
@@ -661,14 +682,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
661} 682}
662EXPORT_SYMBOL_GPL(arch_add_memory); 683EXPORT_SYMBOL_GPL(arch_add_memory);
663 684
664#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
665int memory_add_physaddr_to_nid(u64 start)
666{
667 return 0;
668}
669EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
670#endif
671
672#endif /* CONFIG_MEMORY_HOTPLUG */ 685#endif /* CONFIG_MEMORY_HOTPLUG */
673 686
674static struct kcore_list kcore_vsyscall; 687static struct kcore_list kcore_vsyscall;
@@ -799,52 +812,6 @@ void mark_rodata_ro(void)
799 812
800#endif 813#endif
801 814
802int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
803 int flags)
804{
805#ifdef CONFIG_NUMA
806 int nid, next_nid;
807 int ret;
808#endif
809 unsigned long pfn = phys >> PAGE_SHIFT;
810
811 if (pfn >= max_pfn) {
812 /*
813 * This can happen with kdump kernels when accessing
814 * firmware tables:
815 */
816 if (pfn < max_pfn_mapped)
817 return -EFAULT;
818
819 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
820 phys, len);
821 return -EFAULT;
822 }
823
824 /* Should check here against the e820 map to avoid double free */
825#ifdef CONFIG_NUMA
826 nid = phys_to_nid(phys);
827 next_nid = phys_to_nid(phys + len - 1);
828 if (nid == next_nid)
829 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
830 else
831 ret = reserve_bootmem(phys, len, flags);
832
833 if (ret != 0)
834 return ret;
835
836#else
837 reserve_bootmem(phys, len, flags);
838#endif
839
840 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
841 dma_reserve += len / PAGE_SIZE;
842 set_dma_reserve(dma_reserve);
843 }
844
845 return 0;
846}
847
848int kern_addr_valid(unsigned long addr) 815int kern_addr_valid(unsigned long addr)
849{ 816{
850 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 817 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
@@ -890,18 +857,18 @@ static struct vm_area_struct gate_vma = {
890 .vm_flags = VM_READ | VM_EXEC 857 .vm_flags = VM_READ | VM_EXEC
891}; 858};
892 859
893struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 860struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
894{ 861{
895#ifdef CONFIG_IA32_EMULATION 862#ifdef CONFIG_IA32_EMULATION
896 if (test_tsk_thread_flag(tsk, TIF_IA32)) 863 if (!mm || mm->context.ia32_compat)
897 return NULL; 864 return NULL;
898#endif 865#endif
899 return &gate_vma; 866 return &gate_vma;
900} 867}
901 868
902int in_gate_area(struct task_struct *task, unsigned long addr) 869int in_gate_area(struct mm_struct *mm, unsigned long addr)
903{ 870{
904 struct vm_area_struct *vma = get_gate_vma(task); 871 struct vm_area_struct *vma = get_gate_vma(mm);
905 872
906 if (!vma) 873 if (!vma)
907 return 0; 874 return 0;
@@ -910,11 +877,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
910} 877}
911 878
912/* 879/*
913 * Use this when you have no reliable task/vma, typically from interrupt 880 * Use this when you have no reliable mm, typically from interrupt
914 * context. It is less reliable than using the task's vma and may give 881 * context. It is less reliable than using a task's mm and may give
915 * false positives: 882 * false positives.
916 */ 883 */
917int in_gate_area_no_task(unsigned long addr) 884int in_gate_area_no_mm(unsigned long addr)
918{ 885{
919 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); 886 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
920} 887}
@@ -928,6 +895,17 @@ const char *arch_vma_name(struct vm_area_struct *vma)
928 return NULL; 895 return NULL;
929} 896}
930 897
898#ifdef CONFIG_X86_UV
899unsigned long memory_block_size_bytes(void)
900{
901 if (is_uv_system()) {
902 printk(KERN_INFO "UV: memory block size 2GB\n");
903 return 2UL * 1024 * 1024 * 1024;
904 }
905 return MIN_MEMORY_BLOCK_SIZE;
906}
907#endif
908
931#ifdef CONFIG_SPARSEMEM_VMEMMAP 909#ifdef CONFIG_SPARSEMEM_VMEMMAP
932/* 910/*
933 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 911 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
@@ -1003,6 +981,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1003 } 981 }
1004 982
1005 } 983 }
984 sync_global_pgds((unsigned long)start_page, end);
1006 return 0; 985 return 0;
1007} 986}
1008 987
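The map_low_page()/unmap_low_page() pair added above maps a single page with early_memremap() but carries the sub-page offset of the requested pointer through, so callers like phys_pmd_init() can pass an unaligned table address and get back a directly usable virtual address. A minimal userspace sketch of that offset-carry idea, not part of the patch (toy names, aligned_alloc() standing in for early_memremap()):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE	4096UL
#define TOY_PAGE_MASK	(~((uintptr_t)TOY_PAGE_SIZE - 1))

/* Map the page containing 'phys' and re-apply the in-page offset. */
static void *toy_map_low_page(uintptr_t phys)
{
	uintptr_t left = phys & (TOY_PAGE_SIZE - 1);
	/* stand-in for early_memremap(phys & PAGE_MASK, PAGE_SIZE) */
	uintptr_t page = (uintptr_t)aligned_alloc(TOY_PAGE_SIZE, TOY_PAGE_SIZE);

	return page ? (void *)(page | left) : NULL;
}

/* Mask the carried offset back off before freeing, as unmap_low_page() does. */
static void toy_unmap_low_page(void *adr)
{
	free((void *)((uintptr_t)adr & TOY_PAGE_MASK));
}

int main(void)
{
	void *p = toy_map_low_page(0x12345678);

	printf("carried offset: %#lx\n",
	       (unsigned long)((uintptr_t)p & (TOY_PAGE_SIZE - 1)));
	toy_unmap_low_page(p);
	return 0;
}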
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 72fc70cf6184..7b179b499fa3 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
48} 48}
49EXPORT_SYMBOL_GPL(iomap_create_wc); 49EXPORT_SYMBOL_GPL(iomap_create_wc);
50 50
51void 51void iomap_free(resource_size_t base, unsigned long size)
52iomap_free(resource_size_t base, unsigned long size)
53{ 52{
54 io_free_memtype(base, base + size); 53 io_free_memtype(base, base + size);
55} 54}
56EXPORT_SYMBOL_GPL(iomap_free); 55EXPORT_SYMBOL_GPL(iomap_free);
57 56
58void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 57void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
59{ 58{
60 enum fixed_addresses idx;
61 unsigned long vaddr; 59 unsigned long vaddr;
60 int idx, type;
62 61
63 pagefault_disable(); 62 pagefault_disable();
64 63
65 debug_kmap_atomic(type); 64 type = kmap_atomic_idx_push();
66 idx = type + KM_TYPE_NR * smp_processor_id(); 65 idx = type + KM_TYPE_NR * smp_processor_id();
67 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 66 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
68 set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); 67 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
@@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
72} 71}
73 72
74/* 73/*
75 * Map 'pfn' using fixed map 'type' and protections 'prot' 74 * Map 'pfn' using protections 'prot'
76 */ 75 */
77void __iomem * 76void __iomem *
78iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 77iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
79{ 78{
80 /* 79 /*
81 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 80 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
@@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
86 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 85 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
87 prot = PAGE_KERNEL_UC_MINUS; 86 prot = PAGE_KERNEL_UC_MINUS;
88 87
89 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); 88 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
90} 89}
91EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 90EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
92 91
93void 92void
94iounmap_atomic(void __iomem *kvaddr, enum km_type type) 93iounmap_atomic(void __iomem *kvaddr)
95{ 94{
96 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 95 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
97 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
98 96
99 /* 97 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
100 * Force other mappings to Oops if they'll try to access this pte 98 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
101 * without first remap it. Keeping stale mappings around is a bad idea 99 int idx, type;
102 * also, in case the page changes cacheability attributes or becomes 100
103 * a protected page in a hypervisor. 101 type = kmap_atomic_idx();
104 */ 102 idx = type + KM_TYPE_NR * smp_processor_id();
105 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 103
104#ifdef CONFIG_DEBUG_HIGHMEM
105 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
106#endif
107 /*
108 * Force other mappings to Oops if they try to access this
109 * pte without first remapping it. Keeping stale mappings around
110 * is also a bad idea, in case the page changes cacheability
111 * attributes or becomes a protected page in a hypervisor.
112 */
106 kpte_clear_flush(kmap_pte-idx, vaddr); 113 kpte_clear_flush(kmap_pte-idx, vaddr);
114 kmap_atomic_idx_pop();
115 }
107 116
108 pagefault_enable(); 117 pagefault_enable();
109} 118}
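With the km_type argument dropped, iomap_atomic_prot_pfn() and iounmap_atomic() manage the fixmap slot internally through the per-CPU index stack (kmap_atomic_idx_push()/kmap_atomic_idx_pop()), so callers only have to pair map and unmap. A hedged usage sketch against the two exported signatures in the hunk above; the helper, pfn and length are invented:

#include <linux/io.h>
#include <asm/iomap.h>

/* Hypothetical helper: copy up to one page into a write-combined frame. */
static void copy_to_wc_pfn(unsigned long pfn, const void *src, size_t len)
{
	void __iomem *va;

	va = iomap_atomic_prot_pfn(pfn, PAGE_KERNEL_WC);	/* no km_type argument any more */
	memcpy_toio(va, src, len);				/* atomic context: no sleeping here */
	iounmap_atomic(va);					/* slot index popped internally */
}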
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3ba6e0608c55..be1ef574ce9a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
91 return (__force void __iomem *)phys_to_virt(phys_addr); 91 return (__force void __iomem *)phys_to_virt(phys_addr);
92 92
93 /* 93 /*
94 * Check if the request spans more than any BAR in the iomem resource
95 * tree.
96 */
97 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
98 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
99
100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 94 * Don't allow anybody to remap normal RAM that we're using..
102 */ 95 */
103 last_pfn = last_addr >> PAGE_SHIFT; 96 last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 ret_addr = (void __iomem *) (vaddr + offset); 163 ret_addr = (void __iomem *) (vaddr + offset);
171 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 164 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
172 165
166 /*
167 * Check if the request spans more than any BAR in the iomem resource
168 * tree.
169 */
170 WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
171 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
172
173 return ret_addr; 173 return ret_addr;
174err_free_area: 174err_free_area:
175 free_vm_area(area); 175 free_vm_area(area);
@@ -362,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
362 return &bm_pte[pte_index(addr)]; 362 return &bm_pte[pte_index(addr)];
363} 363}
364 364
365bool __init is_early_ioremap_ptep(pte_t *ptep)
366{
367 return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
368}
369
365static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; 370static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
366 371
367void __init early_ioremap_init(void) 372void __init early_ioremap_init(void)
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
185 e->trace.entries = e->trace_entries; 185 e->trace.entries = e->trace_entries;
186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries); 186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
187 e->trace.skip = 0; 187 e->trace.skip = 0;
188 save_stack_trace_bp(&e->trace, regs->bp); 188 save_stack_trace_regs(&e->trace, regs);
189 189
190 /* Round address down to nearest 16 bytes */ 190 /* Round address down to nearest 16 bytes */
191 shadow_copy = kmemcheck_shadow_lookup(address 191 shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
631 if (!pte) 631 if (!pte)
632 return false; 632 return false;
633 633
634 WARN_ON_ONCE(in_nmi());
635
634 if (error_code & 2) 636 if (error_code & 2)
635 kmemcheck_access(regs, address, KMEMCHECK_WRITE); 637 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
636 else 638 else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
9 b == 0xf0 || b == 0xf2 || b == 0xf3 9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */ 10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e 12 || b == 0x64 || b == 0x65
13 /* Group 3 */ 13 /* Group 3 */
14 || b == 0x66 14 || b == 0x66
15 /* Group 4 */ 15 /* Group 4 */
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
new file mode 100644
index 000000000000..992da5ec5a64
--- /dev/null
+++ b/arch/x86/mm/memblock.c
@@ -0,0 +1,348 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bitops.h>
5#include <linux/memblock.h>
6#include <linux/bootmem.h>
7#include <linux/mm.h>
8#include <linux/range.h>
9
10/* Check for already reserved areas */
11bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
12{
13 struct memblock_region *r;
14 u64 addr = *addrp, last;
15 u64 size = *sizep;
16 bool changed = false;
17
18again:
19 last = addr + size;
20 for_each_memblock(reserved, r) {
21 if (last > r->base && addr < r->base) {
22 size = r->base - addr;
23 changed = true;
24 goto again;
25 }
26 if (last > (r->base + r->size) && addr < (r->base + r->size)) {
27 addr = round_up(r->base + r->size, align);
28 size = last - addr;
29 changed = true;
30 goto again;
31 }
32 if (last <= (r->base + r->size) && addr >= r->base) {
33 *sizep = 0;
34 return false;
35 }
36 }
37 if (changed) {
38 *addrp = addr;
39 *sizep = size;
40 }
41 return changed;
42}
43
44/*
45 * Find next free range after start, and size is returned in *sizep
46 */
47u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
48{
49 struct memblock_region *r;
50
51 for_each_memblock(memory, r) {
52 u64 ei_start = r->base;
53 u64 ei_last = ei_start + r->size;
54 u64 addr;
55
56 addr = round_up(ei_start, align);
57 if (addr < start)
58 addr = round_up(start, align);
59 if (addr >= ei_last)
60 continue;
61 *sizep = ei_last - addr;
62 while (memblock_x86_check_reserved_size(&addr, sizep, align))
63 ;
64
65 if (*sizep)
66 return addr;
67 }
68
69 return MEMBLOCK_ERROR;
70}
71
72static __init struct range *find_range_array(int count)
73{
74 u64 end, size, mem;
75 struct range *range;
76
77 size = sizeof(struct range) * count;
78 end = memblock.current_limit;
79
80 mem = memblock_find_in_range(0, end, size, sizeof(struct range));
81 if (mem == MEMBLOCK_ERROR)
82 panic("can not find more space for range array");
83
84 /*
85 * This range is temporary, so don't reserve it; it will not be
86 * overlapped because we will not allocate a new buffer before
87 * we discard this one
88 */
89 range = __va(mem);
90 memset(range, 0, size);
91
92 return range;
93}
94
95static void __init memblock_x86_subtract_reserved(struct range *range, int az)
96{
97 u64 final_start, final_end;
98 struct memblock_region *r;
99
100 /* Take out the region array itself first */
101 memblock_free_reserved_regions();
102
103 memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
104
105 for_each_memblock(reserved, r) {
106 memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
107 final_start = PFN_DOWN(r->base);
108 final_end = PFN_UP(r->base + r->size);
109 if (final_start >= final_end)
110 continue;
111 subtract_range(range, az, final_start, final_end);
112 }
113
114 /* Put region array back ? */
115 memblock_reserve_reserved_regions();
116}
117
118struct count_data {
119 int nr;
120};
121
122static int __init count_work_fn(unsigned long start_pfn,
123 unsigned long end_pfn, void *datax)
124{
125 struct count_data *data = datax;
126
127 data->nr++;
128
129 return 0;
130}
131
132static int __init count_early_node_map(int nodeid)
133{
134 struct count_data data;
135
136 data.nr = 0;
137 work_with_active_regions(nodeid, count_work_fn, &data);
138
139 return data.nr;
140}
141
142int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
143 unsigned long start_pfn, unsigned long end_pfn)
144{
145 int count;
146 struct range *range;
147 int nr_range;
148
149 count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
150
151 range = find_range_array(count);
152 nr_range = 0;
153
154 /*
155 * Use early_node_map[] and memblock.reserved.region to get range array
156 * at first
157 */
158 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
159 subtract_range(range, count, 0, start_pfn);
160 subtract_range(range, count, end_pfn, -1ULL);
161
162 memblock_x86_subtract_reserved(range, count);
163 nr_range = clean_sort_range(range, count);
164
165 *rangep = range;
166 return nr_range;
167}
168
169int __init get_free_all_memory_range(struct range **rangep, int nodeid)
170{
171 unsigned long end_pfn = -1UL;
172
173#ifdef CONFIG_X86_32
174 end_pfn = max_low_pfn;
175#endif
176 return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
177}
178
179static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
180{
181 int i, count;
182 struct range *range;
183 int nr_range;
184 u64 final_start, final_end;
185 u64 free_size;
186 struct memblock_region *r;
187
188 count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
189
190 range = find_range_array(count);
191 nr_range = 0;
192
193 addr = PFN_UP(addr);
194 limit = PFN_DOWN(limit);
195
196 for_each_memblock(memory, r) {
197 final_start = PFN_UP(r->base);
198 final_end = PFN_DOWN(r->base + r->size);
199 if (final_start >= final_end)
200 continue;
201 if (final_start >= limit || final_end <= addr)
202 continue;
203
204 nr_range = add_range(range, count, nr_range, final_start, final_end);
205 }
206 subtract_range(range, count, 0, addr);
207 subtract_range(range, count, limit, -1ULL);
208
209 /* Subtract memblock.reserved.region in range ? */
210 if (!get_free)
211 goto sort_and_count_them;
212 for_each_memblock(reserved, r) {
213 final_start = PFN_DOWN(r->base);
214 final_end = PFN_UP(r->base + r->size);
215 if (final_start >= final_end)
216 continue;
217 if (final_start >= limit || final_end <= addr)
218 continue;
219
220 subtract_range(range, count, final_start, final_end);
221 }
222
223sort_and_count_them:
224 nr_range = clean_sort_range(range, count);
225
226 free_size = 0;
227 for (i = 0; i < nr_range; i++)
228 free_size += range[i].end - range[i].start;
229
230 return free_size << PAGE_SHIFT;
231}
232
233u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
234{
235 return __memblock_x86_memory_in_range(addr, limit, true);
236}
237
238u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
239{
240 return __memblock_x86_memory_in_range(addr, limit, false);
241}
242
243void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
244{
245 if (start == end)
246 return;
247
248 if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
249 return;
250
251 memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
252
253 memblock_reserve(start, end - start);
254}
255
256void __init memblock_x86_free_range(u64 start, u64 end)
257{
258 if (start == end)
259 return;
260
261 if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
262 return;
263
264 memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
265
266 memblock_free(start, end - start);
267}
268
269/*
270 * Need to call this function after memblock_x86_register_active_regions,
271 * so early_node_map[] is filled already.
272 */
273u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
274{
275 u64 addr;
276 addr = find_memory_core_early(nid, size, align, start, end);
277 if (addr != MEMBLOCK_ERROR)
278 return addr;
279
280 /* Fallback, should already have start end within node range */
281 return memblock_find_in_range(start, end, size, align);
282}
283
284/*
285 * Finds an active region in the address range from start_pfn to last_pfn and
286 * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
287 */
288static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
289 unsigned long start_pfn,
290 unsigned long last_pfn,
291 unsigned long *ei_startpfn,
292 unsigned long *ei_endpfn)
293{
294 u64 align = PAGE_SIZE;
295
296 *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
297 *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
298
299 /* Skip map entries smaller than a page */
300 if (*ei_startpfn >= *ei_endpfn)
301 return 0;
302
303 /* Skip if map is outside the node */
304 if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
305 return 0;
306
307 /* Check for overlaps */
308 if (*ei_startpfn < start_pfn)
309 *ei_startpfn = start_pfn;
310 if (*ei_endpfn > last_pfn)
311 *ei_endpfn = last_pfn;
312
313 return 1;
314}
315
316/* Walk the memblock.memory map and register active regions within a node */
317void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
318 unsigned long last_pfn)
319{
320 unsigned long ei_startpfn;
321 unsigned long ei_endpfn;
322 struct memblock_region *r;
323
324 for_each_memblock(memory, r)
325 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
326 &ei_startpfn, &ei_endpfn))
327 add_active_range(nid, ei_startpfn, ei_endpfn);
328}
329
330/*
331 * Find the hole size (in bytes) in the memory range.
332 * @start: starting address of the memory range to scan
333 * @end: ending address of the memory range to scan
334 */
335u64 __init memblock_x86_hole_size(u64 start, u64 end)
336{
337 unsigned long start_pfn = start >> PAGE_SHIFT;
338 unsigned long last_pfn = end >> PAGE_SHIFT;
339 unsigned long ei_startpfn, ei_endpfn, ram = 0;
340 struct memblock_region *r;
341
342 for_each_memblock(memory, r)
343 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
344 &ei_startpfn, &ei_endpfn))
345 ram += ei_endpfn - ei_startpfn;
346
347 return end - start - ((u64)ram << PAGE_SHIFT);
348}
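memblock_x86_find_in_range_size() and memblock_x86_check_reserved_size() above are meant to be iterated: the former returns the next chunk of RAM at or above 'start', the latter trims it against every reserved region, and MEMBLOCK_ERROR signals that nothing is left (memtest.c below uses exactly this loop). A minimal sketch of such a walk; the function name is hypothetical:

/* Visit every unreserved RAM range, lowest address first. */
static void __init walk_free_ranges(void)
{
	u64 start = 0, size;

	for (;;) {
		start = memblock_x86_find_in_range_size(start, &size, PAGE_SIZE);
		if (start == MEMBLOCK_ERROR)
			break;
		/* [start, start + size) carries no memblock reservation */
		start += size;
	}
}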
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 18d244f70205..92faf3a1c53e 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -6,8 +6,7 @@
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/pfn.h> 8#include <linux/pfn.h>
9 9#include <linux/memblock.h>
10#include <asm/e820.h>
11 10
12static u64 patterns[] __initdata = { 11static u64 patterns[] __initdata = {
13 0, 12 0,
@@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
35 (unsigned long long) pattern, 34 (unsigned long long) pattern,
36 (unsigned long long) start_bad, 35 (unsigned long long) start_bad,
37 (unsigned long long) end_bad); 36 (unsigned long long) end_bad);
38 reserve_early(start_bad, end_bad, "BAD RAM"); 37 memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
39} 38}
40 39
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 40static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74 u64 size = 0; 73 u64 size = 0;
75 74
76 while (start < end) { 75 while (start < end) {
77 start = find_e820_area_size(start, &size, 1); 76 start = memblock_x86_find_in_range_size(start, &size, 1);
78 77
79 /* done ? */ 78 /* done ? */
80 if (start >= end) 79 if (start >= end)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..f5510d889a22 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,15 +1,112 @@
1/* Common code for 32 and 64-bit NUMA */ 1/* Common code for 32 and 64-bit NUMA */
2#include <linux/topology.h> 2#include <linux/kernel.h>
3#include <linux/module.h> 3#include <linux/mm.h>
4#include <linux/string.h>
5#include <linux/init.h>
4#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/memblock.h>
8#include <linux/mmzone.h>
9#include <linux/ctype.h>
10#include <linux/module.h>
11#include <linux/nodemask.h>
12#include <linux/sched.h>
13#include <linux/topology.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/acpi.h>
19#include <asm/amd_nb.h>
20
21#include "numa_internal.h"
22
23int __initdata numa_off;
24nodemask_t numa_nodes_parsed __initdata;
25
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29static struct numa_meminfo numa_meminfo
30#ifndef CONFIG_MEMORY_HOTPLUG
31__initdata
32#endif
33;
34
35static int numa_distance_cnt;
36static u8 *numa_distance;
37
38static __init int numa_setup(char *opt)
39{
40 if (!opt)
41 return -EINVAL;
42 if (!strncmp(opt, "off", 3))
43 numa_off = 1;
44#ifdef CONFIG_NUMA_EMU
45 if (!strncmp(opt, "fake=", 5))
46 numa_emu_cmdline(opt + 5);
47#endif
48#ifdef CONFIG_ACPI_NUMA
49 if (!strncmp(opt, "noacpi", 6))
50 acpi_numa = -1;
51#endif
52 return 0;
53}
54early_param("numa", numa_setup);
5 55
6/* 56/*
7 * Which logical CPUs are on which nodes 57 * apicid, cpu, node mappings
8 */ 58 */
59s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
61};
62
63int __cpuinit numa_cpu_node(int cpu)
64{
65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
66
67 if (apicid != BAD_APICID)
68 return __apicid_to_node[apicid];
69 return NUMA_NO_NODE;
70}
71
9cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
10EXPORT_SYMBOL(node_to_cpumask_map); 73EXPORT_SYMBOL(node_to_cpumask_map);
11 74
12/* 75/*
76 * Map cpu index to node index
77 */
78DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
80
81void __cpuinit numa_set_node(int cpu, int node)
82{
83 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
84
85 /* early setting, no percpu area yet */
86 if (cpu_to_node_map) {
87 cpu_to_node_map[cpu] = node;
88 return;
89 }
90
91#ifdef CONFIG_DEBUG_PER_CPU_MAPS
92 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
93 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
94 dump_stack();
95 return;
96 }
97#endif
98 per_cpu(x86_cpu_to_node_map, cpu) = node;
99
100 if (node != NUMA_NO_NODE)
101 set_cpu_numa_node(cpu, node);
102}
103
104void __cpuinit numa_clear_node(int cpu)
105{
106 numa_set_node(cpu, NUMA_NO_NODE);
107}
108
109/*
13 * Allocate node_to_cpumask_map based on number of available nodes 110 * Allocate node_to_cpumask_map based on number of available nodes
14 * Requires node_possible_map to be valid. 111 * Requires node_possible_map to be valid.
15 * 112 *
@@ -35,7 +132,659 @@ void __init setup_node_to_cpumask_map(void)
35 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
36} 133}
37 134
38#ifdef CONFIG_DEBUG_PER_CPU_MAPS 135static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
136 struct numa_meminfo *mi)
137{
138 /* ignore zero length blks */
139 if (start == end)
140 return 0;
141
142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
145 nid, start, end);
146 return 0;
147 }
148
149 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
150 pr_err("NUMA: too many memblk ranges\n");
151 return -EINVAL;
152 }
153
154 mi->blk[mi->nr_blks].start = start;
155 mi->blk[mi->nr_blks].end = end;
156 mi->blk[mi->nr_blks].nid = nid;
157 mi->nr_blks++;
158 return 0;
159}
160
161/**
162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
163 * @idx: Index of memblk to remove
164 * @mi: numa_meminfo to remove memblk from
165 *
166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
167 * decrementing @mi->nr_blks.
168 */
169void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
170{
171 mi->nr_blks--;
172 memmove(&mi->blk[idx], &mi->blk[idx + 1],
173 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
174}
175
176/**
177 * numa_add_memblk - Add one numa_memblk to numa_meminfo
178 * @nid: NUMA node ID of the new memblk
179 * @start: Start address of the new memblk
180 * @end: End address of the new memblk
181 *
182 * Add a new memblk to the default numa_meminfo.
183 *
184 * RETURNS:
185 * 0 on success, -errno on failure.
186 */
187int __init numa_add_memblk(int nid, u64 start, u64 end)
188{
189 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
190}
191
192/* Initialize NODE_DATA for a node on the local memory */
193static void __init setup_node_data(int nid, u64 start, u64 end)
194{
195 const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
196 const u64 nd_high = PFN_PHYS(max_pfn_mapped);
197 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
198 bool remapped = false;
199 u64 nd_pa;
200 void *nd;
201 int tnid;
202
203 /*
204 * Don't confuse VM with a node that doesn't have the
205 * minimum amount of memory:
206 */
207 if (end && (end - start) < NODE_MIN_SIZE)
208 return;
209
210 /* initialize remap allocator before aligning to ZONE_ALIGN */
211 init_alloc_remap(nid, start, end);
212
213 start = roundup(start, ZONE_ALIGN);
214
215 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
216 nid, start, end);
217
218 /*
219 * Allocate node data. Try remap allocator first, node-local
220 * memory and then any node. Never allocate in DMA zone.
221 */
222 nd = alloc_remap(nid, nd_size);
223 if (nd) {
224 nd_pa = __pa(nd);
225 remapped = true;
226 } else {
227 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
228 nd_size, SMP_CACHE_BYTES);
229 if (nd_pa == MEMBLOCK_ERROR)
230 nd_pa = memblock_find_in_range(nd_low, nd_high,
231 nd_size, SMP_CACHE_BYTES);
232 if (nd_pa == MEMBLOCK_ERROR) {
233 pr_err("Cannot find %zu bytes in node %d\n",
234 nd_size, nid);
235 return;
236 }
237 memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
238 nd = __va(nd_pa);
239 }
240
241 /* report and initialize */
242 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
243 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
244 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
245 if (!remapped && tnid != nid)
246 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
247
248 node_data[nid] = nd;
249 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
250 NODE_DATA(nid)->node_id = nid;
251 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
252 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
253
254 node_set_online(nid);
255}
256
257/**
258 * numa_cleanup_meminfo - Cleanup a numa_meminfo
259 * @mi: numa_meminfo to clean up
260 *
261 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
262 * conflicts and clear unused memblks.
263 *
264 * RETURNS:
265 * 0 on success, -errno on failure.
266 */
267int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
268{
269 const u64 low = 0;
270 const u64 high = PFN_PHYS(max_pfn);
271 int i, j, k;
272
273 /* first, trim all entries */
274 for (i = 0; i < mi->nr_blks; i++) {
275 struct numa_memblk *bi = &mi->blk[i];
276
277 /* make sure all blocks are inside the limits */
278 bi->start = max(bi->start, low);
279 bi->end = min(bi->end, high);
280
281 /* and there's no empty block */
282 if (bi->start >= bi->end)
283 numa_remove_memblk_from(i--, mi);
284 }
285
286 /* merge neighboring / overlapping entries */
287 for (i = 0; i < mi->nr_blks; i++) {
288 struct numa_memblk *bi = &mi->blk[i];
289
290 for (j = i + 1; j < mi->nr_blks; j++) {
291 struct numa_memblk *bj = &mi->blk[j];
292 u64 start, end;
293
294 /*
295 * See whether there are overlapping blocks. Whine
296 * about but allow overlaps of the same nid. They
297 * will be merged below.
298 */
299 if (bi->end > bj->start && bi->start < bj->end) {
300 if (bi->nid != bj->nid) {
301 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
302 bi->nid, bi->start, bi->end,
303 bj->nid, bj->start, bj->end);
304 return -EINVAL;
305 }
306 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
307 bi->nid, bi->start, bi->end,
308 bj->start, bj->end);
309 }
310
311 /*
312 * Join together blocks on the same node, holes
313 * between which don't overlap with memory on other
314 * nodes.
315 */
316 if (bi->nid != bj->nid)
317 continue;
318 start = min(bi->start, bj->start);
319 end = max(bi->end, bj->end);
320 for (k = 0; k < mi->nr_blks; k++) {
321 struct numa_memblk *bk = &mi->blk[k];
322
323 if (bi->nid == bk->nid)
324 continue;
325 if (start < bk->end && end > bk->start)
326 break;
327 }
328 if (k < mi->nr_blks)
329 continue;
330 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
331 bi->nid, bi->start, bi->end, bj->start, bj->end,
332 start, end);
333 bi->start = start;
334 bi->end = end;
335 numa_remove_memblk_from(j--, mi);
336 }
337 }
338
339 /* clear unused ones */
340 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
341 mi->blk[i].start = mi->blk[i].end = 0;
342 mi->blk[i].nid = NUMA_NO_NODE;
343 }
344
345 return 0;
346}
347
348/*
349 * Set nodes, which have memory in @mi, in *@nodemask.
350 */
351static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
352 const struct numa_meminfo *mi)
353{
354 int i;
355
356 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
357 if (mi->blk[i].start != mi->blk[i].end &&
358 mi->blk[i].nid != NUMA_NO_NODE)
359 node_set(mi->blk[i].nid, *nodemask);
360}
361
362/**
363 * numa_reset_distance - Reset NUMA distance table
364 *
365 * The current table is freed. The next numa_set_distance() call will
366 * create a new one.
367 */
368void __init numa_reset_distance(void)
369{
370 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
371
372 /* numa_distance could be 1LU marking allocation failure, test cnt */
373 if (numa_distance_cnt)
374 memblock_x86_free_range(__pa(numa_distance),
375 __pa(numa_distance) + size);
376 numa_distance_cnt = 0;
377 numa_distance = NULL; /* enable table creation */
378}
379
380static int __init numa_alloc_distance(void)
381{
382 nodemask_t nodes_parsed;
383 size_t size;
384 int i, j, cnt = 0;
385 u64 phys;
386
387 /* size the new table and allocate it */
388 nodes_parsed = numa_nodes_parsed;
389 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
390
391 for_each_node_mask(i, nodes_parsed)
392 cnt = i;
393 cnt++;
394 size = cnt * cnt * sizeof(numa_distance[0]);
395
396 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
397 size, PAGE_SIZE);
398 if (phys == MEMBLOCK_ERROR) {
399 pr_warning("NUMA: Warning: can't allocate distance table!\n");
400 /* don't retry until explicitly reset */
401 numa_distance = (void *)1LU;
402 return -ENOMEM;
403 }
404 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
405
406 numa_distance = __va(phys);
407 numa_distance_cnt = cnt;
408
409 /* fill with the default distances */
410 for (i = 0; i < cnt; i++)
411 for (j = 0; j < cnt; j++)
412 numa_distance[i * cnt + j] = i == j ?
413 LOCAL_DISTANCE : REMOTE_DISTANCE;
414 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
415
416 return 0;
417}
418
419/**
420 * numa_set_distance - Set NUMA distance from one NUMA to another
421 * @from: the 'from' node to set distance
422 * @to: the 'to' node to set distance
423 * @distance: NUMA distance
424 *
425 * Set the distance from node @from to @to to @distance. If distance table
426 * doesn't exist, one which is large enough to accommodate all the currently
427 * known nodes will be created.
428 *
429 * If such table cannot be allocated, a warning is printed and further
430 * calls are ignored until the distance table is reset with
431 * numa_reset_distance().
432 *
433 * If @from or @to is higher than the highest known node at the time of
434 * table creation or @distance doesn't make sense, the call is ignored.
435 * This is to allow simplification of specific NUMA config implementations.
436 */
437void __init numa_set_distance(int from, int to, int distance)
438{
439 if (!numa_distance && numa_alloc_distance() < 0)
440 return;
441
442 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
443 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
444 from, to, distance);
445 return;
446 }
447
448 if ((u8)distance != distance ||
449 (from == to && distance != LOCAL_DISTANCE)) {
450 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
451 from, to, distance);
452 return;
453 }
454
455 numa_distance[from * numa_distance_cnt + to] = distance;
456}
457
458int __node_distance(int from, int to)
459{
460 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
461 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
462 return numa_distance[from * numa_distance_cnt + to];
463}
464EXPORT_SYMBOL(__node_distance);
465
466/*
467 * Sanity check to catch more bad NUMA configurations (they are amazingly
468 * common). Make sure the nodes cover all memory.
469 */
470static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
471{
472 u64 numaram, e820ram;
473 int i;
474
475 numaram = 0;
476 for (i = 0; i < mi->nr_blks; i++) {
477 u64 s = mi->blk[i].start >> PAGE_SHIFT;
478 u64 e = mi->blk[i].end >> PAGE_SHIFT;
479 numaram += e - s;
480 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
481 if ((s64)numaram < 0)
482 numaram = 0;
483 }
484
485 e820ram = max_pfn - (memblock_x86_hole_size(0,
486 PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
487 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
488 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
489 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
490 (numaram << PAGE_SHIFT) >> 20,
491 (e820ram << PAGE_SHIFT) >> 20);
492 return false;
493 }
494 return true;
495}
496
497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{
499 int i, nid;
500
501 /* Account for nodes with cpus and no memory */
502 node_possible_map = numa_nodes_parsed;
503 numa_nodemask_from_meminfo(&node_possible_map, mi);
504 if (WARN_ON(nodes_empty(node_possible_map)))
505 return -EINVAL;
506
507 for (i = 0; i < mi->nr_blks; i++)
508 memblock_x86_register_active_regions(mi->blk[i].nid,
509 mi->blk[i].start >> PAGE_SHIFT,
510 mi->blk[i].end >> PAGE_SHIFT);
511
512 /* for out of order entries */
513 sort_node_map();
514 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL;
516
517 /* Finally register nodes. */
518 for_each_node_mask(nid, node_possible_map) {
519 u64 start = PFN_PHYS(max_pfn);
520 u64 end = 0;
521
522 for (i = 0; i < mi->nr_blks; i++) {
523 if (nid != mi->blk[i].nid)
524 continue;
525 start = min(mi->blk[i].start, start);
526 end = max(mi->blk[i].end, end);
527 }
528
529 if (start < end)
530 setup_node_data(nid, start, end);
531 }
532
533 return 0;
534}
535
536/*
537 * There are unfortunately some poorly designed mainboards around that
538 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
539 * mapping. To avoid this fill in the mapping for all possible CPUs,
540 * as the number of CPUs is not known yet. We round robin the existing
541 * nodes.
542 */
543static void __init numa_init_array(void)
544{
545 int rr, i;
546
547 rr = first_node(node_online_map);
548 for (i = 0; i < nr_cpu_ids; i++) {
549 if (early_cpu_to_node(i) != NUMA_NO_NODE)
550 continue;
551 numa_set_node(i, rr);
552 rr = next_node(rr, node_online_map);
553 if (rr == MAX_NUMNODES)
554 rr = first_node(node_online_map);
555 }
556}
557
558static int __init numa_init(int (*init_func)(void))
559{
560 int i;
561 int ret;
562
563 for (i = 0; i < MAX_LOCAL_APIC; i++)
564 set_apicid_to_node(i, NUMA_NO_NODE);
565
566 nodes_clear(numa_nodes_parsed);
567 nodes_clear(node_possible_map);
568 nodes_clear(node_online_map);
569 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
570 remove_all_active_ranges();
571 numa_reset_distance();
572
573 ret = init_func();
574 if (ret < 0)
575 return ret;
576 ret = numa_cleanup_meminfo(&numa_meminfo);
577 if (ret < 0)
578 return ret;
579
580 numa_emulation(&numa_meminfo, numa_distance_cnt);
581
582 ret = numa_register_memblks(&numa_meminfo);
583 if (ret < 0)
584 return ret;
585
586 for (i = 0; i < nr_cpu_ids; i++) {
587 int nid = early_cpu_to_node(i);
588
589 if (nid == NUMA_NO_NODE)
590 continue;
591 if (!node_online(nid))
592 numa_clear_node(i);
593 }
594 numa_init_array();
595 return 0;
596}
597
598/**
599 * dummy_numa_init - Fallback dummy NUMA init
600 *
601 * Used if there's no underlying NUMA architecture, NUMA initialization
602 * fails, or NUMA is disabled on the command line.
603 *
604 * Must online at least one node and add memory blocks that cover all
605 * allowed memory. This function must not fail.
606 */
607static int __init dummy_numa_init(void)
608{
609 printk(KERN_INFO "%s\n",
610 numa_off ? "NUMA turned off" : "No NUMA configuration found");
611 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
612 0LLU, PFN_PHYS(max_pfn));
613
614 node_set(0, numa_nodes_parsed);
615 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
616
617 return 0;
618}
619
620/**
621 * x86_numa_init - Initialize NUMA
622 *
623 * Try each configured NUMA initialization method until one succeeds. The
624 * last fallback is a dummy single-node config encompassing all memory and
625 * never fails.
626 */
627void __init x86_numa_init(void)
628{
629 if (!numa_off) {
630#ifdef CONFIG_X86_NUMAQ
631 if (!numa_init(numaq_numa_init))
632 return;
633#endif
634#ifdef CONFIG_ACPI_NUMA
635 if (!numa_init(x86_acpi_numa_init))
636 return;
637#endif
638#ifdef CONFIG_AMD_NUMA
639 if (!numa_init(amd_numa_init))
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645}
646
647static __init int find_near_online_node(int node)
648{
649 int n, val;
650 int min_val = INT_MAX;
651 int best_node = -1;
652
653 for_each_online_node(n) {
654 val = node_distance(node, n);
655
656 if (val < min_val) {
657 min_val = val;
658 best_node = n;
659 }
660 }
661
662 return best_node;
663}
664
665/*
666 * Setup early cpu_to_node.
667 *
668 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
669 * and apicid_to_node[] tables have valid entries for a CPU.
670 * This means we skip cpu_to_node[] initialisation for NUMA
671 * emulation and faking node case (when running a kernel compiled
672 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
673 * is already initialized in a round robin manner at numa_init_array,
674 * prior to this call, and this initialization is good enough
675 * for the fake NUMA cases.
676 *
677 * Called before the per_cpu areas are setup.
678 */
679void __init init_cpu_to_node(void)
680{
681 int cpu;
682 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
683
684 BUG_ON(cpu_to_apicid == NULL);
685
686 for_each_possible_cpu(cpu) {
687 int node = numa_cpu_node(cpu);
688
689 if (node == NUMA_NO_NODE)
690 continue;
691 if (!node_online(node))
692 node = find_near_online_node(node);
693 numa_set_node(cpu, node);
694 }
695}
696
697#ifndef CONFIG_DEBUG_PER_CPU_MAPS
698
699# ifndef CONFIG_NUMA_EMU
700void __cpuinit numa_add_cpu(int cpu)
701{
702 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
703}
704
705void __cpuinit numa_remove_cpu(int cpu)
706{
707 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
708}
709# endif /* !CONFIG_NUMA_EMU */
710
711#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
712
713int __cpu_to_node(int cpu)
714{
715 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
716 printk(KERN_WARNING
717 "cpu_to_node(%d): usage too early!\n", cpu);
718 dump_stack();
719 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
720 }
721 return per_cpu(x86_cpu_to_node_map, cpu);
722}
723EXPORT_SYMBOL(__cpu_to_node);
724
725/*
726 * Same function as cpu_to_node() but used if called before the
727 * per_cpu areas are setup.
728 */
729int early_cpu_to_node(int cpu)
730{
731 if (early_per_cpu_ptr(x86_cpu_to_node_map))
732 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
733
734 if (!cpu_possible(cpu)) {
735 printk(KERN_WARNING
736 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
737 dump_stack();
738 return NUMA_NO_NODE;
739 }
740 return per_cpu(x86_cpu_to_node_map, cpu);
741}
742
743void debug_cpumask_set_cpu(int cpu, int node, bool enable)
744{
745 struct cpumask *mask;
746 char buf[64];
747
748 if (node == NUMA_NO_NODE) {
749 /* early_cpu_to_node() already emits a warning and trace */
750 return;
751 }
752 mask = node_to_cpumask_map[node];
753 if (!mask) {
754 pr_err("node_to_cpumask_map[%i] NULL\n", node);
755 dump_stack();
756 return;
757 }
758
759 if (enable)
760 cpumask_set_cpu(cpu, mask);
761 else
762 cpumask_clear_cpu(cpu, mask);
763
764 cpulist_scnprintf(buf, sizeof(buf), mask);
765 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
766 enable ? "numa_add_cpu" : "numa_remove_cpu",
767 cpu, node, buf);
768 return;
769}
770
771# ifndef CONFIG_NUMA_EMU
772static void __cpuinit numa_set_cpumask(int cpu, bool enable)
773{
774 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
775}
776
777void __cpuinit numa_add_cpu(int cpu)
778{
779 numa_set_cpumask(cpu, true);
780}
781
782void __cpuinit numa_remove_cpu(int cpu)
783{
784 numa_set_cpumask(cpu, false);
785}
786# endif /* !CONFIG_NUMA_EMU */
787
39/* 788/*
40 * Returns a pointer to the bitmask of CPUs on Node 'node'. 789 * Returns a pointer to the bitmask of CPUs on Node 'node'.
41 */ 790 */
@@ -58,4 +807,20 @@ const struct cpumask *cpumask_of_node(int node)
58 return node_to_cpumask_map[node]; 807 return node_to_cpumask_map[node];
59} 808}
60EXPORT_SYMBOL(cpumask_of_node); 809EXPORT_SYMBOL(cpumask_of_node);
810
811#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
812
813#ifdef CONFIG_MEMORY_HOTPLUG
814int memory_add_physaddr_to_nid(u64 start)
815{
816 struct numa_meminfo *mi = &numa_meminfo;
817 int nid = mi->blk[0].nid;
818 int i;
819
820 for (i = 0; i < mi->nr_blks; i++)
821 if (mi->blk[i].start <= start && mi->blk[i].end > start)
822 nid = mi->blk[i].nid;
823 return nid;
824}
825EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
61#endif 826#endif
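numa_init() above only needs an init_func that records the parsed nodes, their memory blocks and, optionally, the inter-node distances; cleanup, registration and the cpu-to-node fixups then happen in the common code. A hedged sketch of such a method, using only interfaces defined in this file (the two-node layout and the distance value are invented):

/* Hypothetical init_func for numa_init(); not a real backend. */
static int __init example_numa_init(void)
{
	node_set(0, numa_nodes_parsed);
	node_set(1, numa_nodes_parsed);

	numa_add_memblk(0, 0,          2ULL << 30);	/* node 0: 0 - 2G  */
	numa_add_memblk(1, 2ULL << 30, 4ULL << 30);	/* node 1: 2G - 4G */

	numa_set_distance(0, 1, 21);	/* __node_distance(0, 1) returns 21 */
	numa_set_distance(1, 0, 21);

	return 0;
}

It would be tried from x86_numa_init() via numa_init(example_numa_init), falling through to dummy_numa_init() if it returned an error.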
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 809baaaf48b1..849a975d3fa0 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,38 +22,11 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/mm.h>
26#include <linux/bootmem.h> 25#include <linux/bootmem.h>
27#include <linux/mmzone.h> 26#include <linux/memblock.h>
28#include <linux/highmem.h>
29#include <linux/initrd.h>
30#include <linux/nodemask.h>
31#include <linux/module.h> 27#include <linux/module.h>
32#include <linux/kexec.h>
33#include <linux/pfn.h>
34#include <linux/swap.h>
35#include <linux/acpi.h>
36
37#include <asm/e820.h>
38#include <asm/setup.h>
39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
42
43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44EXPORT_SYMBOL(node_data);
45
46/*
47 * numa interface - we expect the numa architecture specific code to have
48 * populated the following initialisation.
49 *
50 * 1) node_online_map - the map of all nodes configured (online) in the system
51 * 2) node_start_pfn - the starting page frame number for a node
52 * 3) node_end_pfn - the ending page fram number for a node
53 */
54unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
55unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
56 28
29#include "numa_internal.h"
57 30
58#ifdef CONFIG_DISCONTIGMEM 31#ifdef CONFIG_DISCONTIGMEM
59/* 32/*
@@ -98,102 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
98} 71}
99#endif 72#endif
100 73
101extern unsigned long find_max_low_pfn(void);
102extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
103 75
104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
105 77
106unsigned long node_remap_size[MAX_NUMNODES];
107static void *node_remap_start_vaddr[MAX_NUMNODES]; 78static void *node_remap_start_vaddr[MAX_NUMNODES];
108void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
109 80
110static unsigned long kva_start_pfn;
111static unsigned long kva_pages;
112/*
113 * FLAT - support for basic PC memory model with discontig enabled, essentially
114 * a single node with all available processors in it with a flat
115 * memory map.
116 */
117int __init get_memcfg_numa_flat(void)
118{
119 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
120
121 node_start_pfn[0] = 0;
122 node_end_pfn[0] = max_pfn;
123 e820_register_active_regions(0, 0, max_pfn);
124 memory_present(0, 0, max_pfn);
125 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
126
127 /* Indicate there is one node available. */
128 nodes_clear(node_online_map);
129 node_set_online(0);
130 return 1;
131}
132
133/*
134 * Find the highest page frame number we have available for the node
135 */
136static void __init propagate_e820_map_node(int nid)
137{
138 if (node_end_pfn[nid] > max_pfn)
139 node_end_pfn[nid] = max_pfn;
140 /*
141 * if a user has given mem=XXXX, then we need to make sure
142 * that the node _starts_ before that, too, not just ends
143 */
144 if (node_start_pfn[nid] > max_pfn)
145 node_start_pfn[nid] = max_pfn;
146 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
147}
148
149/*
150 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
151 * method. For node zero take this from the bottom of memory, for
152 * subsequent nodes place them at node_remap_start_vaddr which contains
153 * node local data in physically node local memory. See setup_memory()
154 * for details.
155 */
156static void __init allocate_pgdat(int nid)
157{
158 char buf[16];
159
160 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
162 else {
163 unsigned long pgdat_phys;
164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
165 max_pfn_mapped<<PAGE_SHIFT,
166 sizeof(pg_data_t),
167 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 memset(buf, 0, sizeof(buf));
170 sprintf(buf, "NODE_DATA %d", nid);
171 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
172 }
173 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
174 nid, (unsigned long)NODE_DATA(nid));
175}
176
177/* 81/*
178 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel 82 * Remap memory allocator
179 * virtual address space (KVA) is reserved and portions of nodes are mapped
180 * using it. This is to allow node-local memory to be allocated for
181 * structures that would normally require ZONE_NORMAL. The memory is
182 * allocated with alloc_remap() and callers should be prepared to allocate
183 * from the bootmem allocator instead.
184 */ 83 */
185static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
186static void *node_remap_end_vaddr[MAX_NUMNODES]; 85static void *node_remap_end_vaddr[MAX_NUMNODES];
187static void *node_remap_alloc_vaddr[MAX_NUMNODES]; 86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
188static unsigned long node_remap_offset[MAX_NUMNODES];
189 87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
95 * only the callers considered there should call this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
190void *alloc_remap(int nid, unsigned long size) 107void *alloc_remap(int nid, unsigned long size)
191{ 108{
192 void *allocation = node_remap_alloc_vaddr[nid]; 109 void *allocation = node_remap_alloc_vaddr[nid];
193 110
194 size = ALIGN(size, L1_CACHE_BYTES); 111 size = ALIGN(size, L1_CACHE_BYTES);
195 112
196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
197 return NULL; 114 return NULL;
198 115
199 node_remap_alloc_vaddr[nid] += size; 116 node_remap_alloc_vaddr[nid] += size;
@@ -202,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
202 return allocation; 119 return allocation;
203} 120}
204 121
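alloc_remap() above is a bump allocator over a fixed per-node window, and its comment requires callers to tolerate a NULL return and fall back to the regular early allocator. Below is a minimal standalone model of both halves of that contract; the window size, the alignment constant and the malloc() fallback are assumptions chosen for illustration (the kernel aligns to L1_CACHE_BYTES and falls back to the bootmem allocator), not kernel code.

#include <stdio.h>
#include <stdlib.h>

#define WINDOW_SIZE	(64 * 1024)	/* stand-in for a node's remap area */
#define CACHE_ALIGN	64		/* stand-in for L1_CACHE_BYTES      */

static char window[WINDOW_SIZE];
static char *cursor = window;

/* mirrors alloc_remap(): align the size, bounds-check, bump the cursor */
static void *remap_alloc(size_t size)
{
	void *p = cursor;

	size = (size + CACHE_ALIGN - 1) & ~(size_t)(CACHE_ALIGN - 1);
	if (cursor + size > window + WINDOW_SIZE)
		return NULL;
	cursor += size;
	return p;
}

int main(void)
{
	void *a = remap_alloc(40 * 1024);	/* fits in the window            */
	void *b = remap_alloc(40 * 1024);	/* exceeds it: returns NULL      */
	int fell_back = 0;

	if (!b) {				/* the fallback the comment asks for */
		b = malloc(40 * 1024);
		fell_back = 1;
	}
	printf("a=%p from remap, b=%p from %s\n", a, b,
	       fell_back ? "fallback" : "remap");
	if (fell_back)
		free(b);
	return 0;
}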
205static void __init remap_numa_kva(void)
206{
207 void *vaddr;
208 unsigned long pfn;
209 int node;
210
211 for_each_online_node(node) {
212 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
213 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
214 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
215 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
216 (unsigned long)vaddr,
217 node_remap_start_pfn[node] + pfn);
218 set_pmd_pfn((ulong) vaddr,
219 node_remap_start_pfn[node] + pfn,
220 PAGE_KERNEL_LARGE);
221 }
222 }
223}
224
225#ifdef CONFIG_HIBERNATION 122#ifdef CONFIG_HIBERNATION
226/** 123/**
227 * resume_map_numa_kva - add KVA mapping to the temporary page tables created 124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -233,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
233 int node; 130 int node;
234 131
235 for_each_online_node(node) { 132 for_each_online_node(node) {
236 unsigned long start_va, start_pfn, size, pfn; 133 unsigned long start_va, start_pfn, nr_pages, pfn;
237 134
238 start_va = (unsigned long)node_remap_start_vaddr[node]; 135 start_va = (unsigned long)node_remap_start_vaddr[node];
239 start_pfn = node_remap_start_pfn[node]; 136 start_pfn = node_remap_start_pfn[node];
240 size = node_remap_size[node]; 137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
241 139
242 printk(KERN_DEBUG "%s: node %d\n", __func__, node); 140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
243 141
244 for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { 142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
245 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); 143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
246 pgd_t *pgd = pgd_base + pgd_index(vaddr); 144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
247 pud_t *pud = pud_offset(pgd, vaddr); 145 pud_t *pud = pud_offset(pgd, vaddr);
@@ -257,134 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
257} 155}
258#endif 156#endif
259 157
260static __init unsigned long calculate_numa_remap_pages(void) 158/**
261{ 159 * init_alloc_remap - Initialize remap allocator for a NUMA node
262 int nid; 160 * @nid: NUMA node to initialize remap allocator for
263 unsigned long size, reserve_pages = 0; 161 *
264 162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
265 for_each_online_node(nid) { 163 * memmap on a different node with lowmem is inefficient, a special
266 u64 node_kva_target; 164 * remap allocator is implemented which can be used by alloc_remap().
267 u64 node_kva_final; 165 *
268 166 * For each node, the amount of memory which will be necessary for
269 /* 167 * pgdat and memmap is calculated and two memory areas of the size are
270 * The acpi/srat node info can show hot-add memory zones 168 * allocated - one in the node and the other in lowmem; then, the area
271 * where memory could be added but not currently present. 169 * in the node is remapped to the lowmem area.
272 */ 170 *
273 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", 171 * As pgdat and memmap must be allocated in lowmem anyway, this
274 nid, node_start_pfn[nid], node_end_pfn[nid]); 172 * doesn't waste lowmem address space; however, the actual lowmem
275 if (node_start_pfn[nid] > max_pfn) 173 * which gets remapped over is wasted. The amount shouldn't be
276 continue; 174 * problematic on machines where this feature will be used.
277 if (!node_end_pfn[nid]) 175 *
278 continue; 176 * Initialization failure isn't fatal. alloc_remap() is used
279 if (node_end_pfn[nid] > max_pfn) 177 * opportunistically and the callers will fall back to other memory
280 node_end_pfn[nid] = max_pfn; 178 * allocation mechanisms on failure.
281 179 */
282 /* ensure the remap includes space for the pgdat. */ 180void __init init_alloc_remap(int nid, u64 start, u64 end)
283 size = node_remap_size[nid] + sizeof(pg_data_t);
284
285 /* convert size to large (pmd size) pages, rounding up */
286 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
287 /* now the roundup is correct, convert to PAGE_SIZE pages */
288 size = size * PTRS_PER_PTE;
289
290 node_kva_target = round_down(node_end_pfn[nid] - size,
291 PTRS_PER_PTE);
292 node_kva_target <<= PAGE_SHIFT;
293 do {
294 node_kva_final = find_e820_area(node_kva_target,
295 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
296 ((u64)size)<<PAGE_SHIFT,
297 LARGE_PAGE_BYTES);
298 node_kva_target -= LARGE_PAGE_BYTES;
299 } while (node_kva_final == -1ULL &&
300 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
301
302 if (node_kva_final == -1ULL)
303 panic("Can not get kva ram\n");
304
305 node_remap_size[nid] = size;
306 node_remap_offset[nid] = reserve_pages;
307 reserve_pages += size;
308 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
309 " node %d at %llx\n",
310 size, nid, node_kva_final>>PAGE_SHIFT);
311
312 /*
313 * prevent kva address below max_low_pfn want it on system
314 * with less memory later.
315 * layout will be: KVA address , KVA RAM
316 *
317 * we are supposed to only record the one less than max_low_pfn
318 * but we could have some hole in high memory, and it will only
319 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
320 * to use it as free.
321 * So reserve_early here, hope we don't run out of that array
322 */
323 reserve_early(node_kva_final,
324 node_kva_final+(((u64)size)<<PAGE_SHIFT),
325 "KVA RAM");
326
327 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
328 remove_active_range(nid, node_remap_start_pfn[nid],
329 node_remap_start_pfn[nid] + size);
330 }
331 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
332 reserve_pages);
333 return reserve_pages;
334}
335
336static void init_remap_allocator(int nid)
337{
338 node_remap_start_vaddr[nid] = pfn_to_kaddr(
339 kva_start_pfn + node_remap_offset[nid]);
340 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
341 (node_remap_size[nid] * PAGE_SIZE);
342 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
343 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
344
345 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
346 (ulong) node_remap_start_vaddr[nid],
347 (ulong) node_remap_end_vaddr[nid]);
348}
349
350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 int acpi, int k8)
352{ 181{
353 int nid; 182 unsigned long start_pfn = start >> PAGE_SHIFT;
354 long kva_target_pfn; 183 unsigned long end_pfn = end >> PAGE_SHIFT;
184 unsigned long size, pfn;
185 u64 node_pa, remap_pa;
186 void *remap_va;
355 187
356 /* 188 /*
357 * When mapping a NUMA machine we allocate the node_mem_map arrays 189 * The acpi/srat node info can show hot-add memory zones where
358 * from node local memory. They are then mapped directly into KVA 190 * memory could be added but not currently present.
359 * between zone normal and vmalloc space. Calculate the size of
360 * this space and use it to adjust the boundary between ZONE_NORMAL
361 * and ZONE_HIGHMEM.
362 */ 191 */
192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
193 nid, start_pfn, end_pfn);
194
195 /* calculate the necessary space aligned to large page size */
196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
198 size = ALIGN(size, LARGE_PAGE_BYTES);
199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (node_pa == MEMBLOCK_ERROR) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (remap_pa == MEMBLOCK_ERROR) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_x86_free_range(node_pa, node_pa + size);
216 return;
217 }
218 memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
235}
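To make the sizing in init_alloc_remap() above concrete, here is a small standalone model of the calculation. All numbers are assumptions for illustration: a node spanning 512 MiB, sizeof(struct page) taken as 32 bytes, sizeof(pg_data_t) rounded to one page, and non-PAE x86_32 where LARGE_PAGE_BYTES = PTRS_PER_PTE * PAGE_SIZE = 4 MiB.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define PTRS_PER_PTE	1024ULL				/* non-PAE x86_32 assumption */
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)

static uint64_t align_up(uint64_t x, uint64_t a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	uint64_t node_bytes   = 512ULL << 20;		/* hypothetical node span        */
	uint64_t struct_page  = 32;			/* assumed sizeof(struct page)   */
	uint64_t pg_data_size = 4096;			/* assumed pg_data_t, one page   */

	/* memmap for the node's pfns ... */
	uint64_t size = (node_bytes / PAGE_SIZE) * struct_page;
	/* ... plus the page-aligned pgdat ... */
	size += align_up(pg_data_size, PAGE_SIZE);
	/* ... rounded up to whole large pages so set_pmd_pfn() can map it */
	size = align_up(size, LARGE_PAGE_BYTES);

	printf("remap area per node: %llu bytes (%llu large pages)\n",
	       (unsigned long long)size,
	       (unsigned long long)(size / LARGE_PAGE_BYTES));
	return 0;
}

With these assumptions the memmap needs exactly 4 MiB, the pgdat pushes it just past that, and rounding to whole large pages reserves 8 MiB (two pmd mappings) per node.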
363 236
364 get_memcfg_numa(); 237void __init initmem_init(void)
365 238{
366 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 239 x86_numa_init();
367
368 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
369 do {
370 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
371 max_low_pfn<<PAGE_SHIFT,
372 kva_pages<<PAGE_SHIFT,
373 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
374 kva_target_pfn -= PTRS_PER_PTE;
375 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
376
377 if (kva_start_pfn == -1UL)
378 panic("Can not get kva space\n");
379
380 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
381 kva_start_pfn, max_low_pfn);
382 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
383 240
384 /* avoid clash with initrd */
385 reserve_early(kva_start_pfn<<PAGE_SHIFT,
386 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
387 "KVA PG");
388#ifdef CONFIG_HIGHMEM 241#ifdef CONFIG_HIGHMEM
389 highstart_pfn = highend_pfn = max_pfn; 242 highstart_pfn = highend_pfn = max_pfn;
390 if (max_pfn > max_low_pfn) 243 if (max_pfn > max_low_pfn)
@@ -404,54 +257,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
404 257
405 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", 258 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
406 (ulong) pfn_to_kaddr(max_low_pfn)); 259 (ulong) pfn_to_kaddr(max_low_pfn));
407 for_each_online_node(nid) {
408 init_remap_allocator(nid);
409
410 allocate_pgdat(nid);
411 }
412 remap_numa_kva();
413 260
414 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 261 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
415 (ulong) pfn_to_kaddr(highstart_pfn)); 262 (ulong) pfn_to_kaddr(highstart_pfn));
416 for_each_online_node(nid)
417 propagate_e820_map_node(nid);
418
419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
425 }
426 263
427 setup_bootmem_allocator(); 264 setup_bootmem_allocator();
428} 265}
429
430#ifdef CONFIG_MEMORY_HOTPLUG
431static int paddr_to_nid(u64 addr)
432{
433 int nid;
434 unsigned long pfn = PFN_DOWN(addr);
435
436 for_each_node(nid)
437 if (node_start_pfn[nid] <= pfn &&
438 pfn < node_end_pfn[nid])
439 return nid;
440
441 return -1;
442}
443
444/*
445 * This function is used to ask node id BEFORE memmap and mem_section's
446 * initialization (pfn_to_nid() can't be used yet).
447 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
448 */
449int memory_add_physaddr_to_nid(u64 addr)
450{
451 int nid = paddr_to_nid(addr);
452 return (nid >= 0) ? nid : 0;
453}
454
455EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
456#endif
457
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..dd27f401f0a0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,697 +2,13 @@
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h> 5#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <linux/sched.h>
15 6
16#include <asm/e820.h> 7#include "numa_internal.h"
17#include <asm/proto.h>
18#include <asm/dma.h>
19#include <asm/numa.h>
20#include <asm/acpi.h>
21#include <asm/k8.h>
22 8
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 9void __init initmem_init(void)
24EXPORT_SYMBOL(node_data);
25
26struct memnode memnode;
27
28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
30};
31
32int numa_off __initdata;
33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size;
35
36/*
37 * Map cpu index to node index
38 */
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
49static int __init populate_memnodemap(const struct bootnode *nodes,
50 int numnodes, int shift, int *nodeids)
51{
52 unsigned long addr, end;
53 int i, res = -1;
54
55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
56 for (i = 0; i < numnodes; i++) {
57 addr = nodes[i].start;
58 end = nodes[i].end;
59 if (addr >= end)
60 continue;
61 if ((end >> shift) >= memnodemapsize)
62 return 0;
63 do {
64 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
65 return -1;
66
67 if (!nodeids)
68 memnodemap[addr >> shift] = i;
69 else
70 memnodemap[addr >> shift] = nodeids[i];
71
72 addr += (1UL << shift);
73 } while (addr < end);
74 res = 1;
75 }
76 return res;
77}
78
79static int __init allocate_cachealigned_memnodemap(void)
80{
81 unsigned long addr;
82
83 memnodemap = memnode.embedded_map;
84 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
85 return 0;
86
87 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
90 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == -1UL) {
92 printk(KERN_ERR
93 "NUMA: Unable to allocate Memory to Node hash map\n");
94 nodemap_addr = nodemap_size = 0;
95 return -1;
96 }
97 memnodemap = phys_to_virt(nodemap_addr);
98 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
99
100 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
101 nodemap_addr, nodemap_addr + nodemap_size);
102 return 0;
103}
104
105/*
106 * The LSB of all start and end addresses in the node map is the value of the
107 * maximum possible shift.
108 */
109static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
110 int numnodes)
111{
112 int i, nodes_used = 0;
113 unsigned long start, end;
114 unsigned long bitfield = 0, memtop = 0;
115
116 for (i = 0; i < numnodes; i++) {
117 start = nodes[i].start;
118 end = nodes[i].end;
119 if (start >= end)
120 continue;
121 bitfield |= start;
122 nodes_used++;
123 if (end > memtop)
124 memtop = end;
125 }
126 if (nodes_used <= 1)
127 i = 63;
128 else
129 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
130 memnodemapsize = (memtop >> i)+1;
131 return i;
132}
133
134int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
135 int *nodeids)
136{
137 int shift;
138
139 shift = extract_lsb_from_nodes(nodes, numnodes);
140 if (allocate_cachealigned_memnodemap())
141 return -1;
142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
143 shift);
144
145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
146 printk(KERN_INFO "Your memory is not aligned you need to "
147 "rebuild your kernel with a bigger NODEMAPSIZE "
148 "shift=%d\n", shift);
149 return -1;
150 }
151 return shift;
152}
153
154int __meminit __early_pfn_to_nid(unsigned long pfn)
155{
156 return phys_to_nid(pfn << PAGE_SHIFT);
157}
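The memnodemap machinery being removed above is easier to see with numbers. Below is a standalone model assuming a hypothetical two-node layout at [0, 4G) and [4G, 8G): OR-ing the start addresses leaves only bit 32 set, so extract_lsb_from_nodes() would pick shift = 32 and phys_to_nid() reduces to one shift plus a table lookup. This only models the mechanism for illustration; the real code also handles the single-node case (shift 63) and allocates the map from e820.

#include <stdio.h>
#include <stdint.h>

struct bootnode { uint64_t start, end; };

int main(void)
{
	/* hypothetical two-node layout */
	struct bootnode nodes[] = {
		{ 0x000000000ULL, 0x100000000ULL },	/* node 0: [0, 4G)  */
		{ 0x100000000ULL, 0x200000000ULL },	/* node 1: [4G, 8G) */
	};
	uint64_t bitfield = 0, addr;
	int memnodemap[16];
	int shift, i;

	for (i = 0; i < 2; i++)
		bitfield |= nodes[i].start;
	/* lowest set bit of all start addresses bounds the usable shift */
	shift = __builtin_ctzll(bitfield);		/* 32 for this layout */

	for (i = 0; i < 2; i++)
		for (addr = nodes[i].start; addr < nodes[i].end; addr += 1ULL << shift)
			memnodemap[addr >> shift] = i;

	/* phys_to_nid() is then a single shift and table lookup */
	printf("shift=%d, address 0x180000000 -> node %d\n",
	       shift, memnodemap[0x180000000ULL >> shift]);
	return 0;
}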
158
159static void * __init early_node_mem(int nodeid, unsigned long start,
160 unsigned long end, unsigned long size,
161 unsigned long align)
162{
163 unsigned long mem;
164
165 /*
166 * put it on high as possible
167 * something will go with NODE_DATA
168 */
169 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
170 start = MAX_DMA_PFN<<PAGE_SHIFT;
171 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
172 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
173 start = MAX_DMA32_PFN<<PAGE_SHIFT;
174 mem = find_e820_area(start, end, size, align);
175 if (mem != -1L)
176 return __va(mem);
177
178 /* extend the search scope */
179 end = max_pfn_mapped << PAGE_SHIFT;
180 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
181 start = MAX_DMA32_PFN<<PAGE_SHIFT;
182 else
183 start = MAX_DMA_PFN<<PAGE_SHIFT;
184 mem = find_e820_area(start, end, size, align);
185 if (mem != -1L)
186 return __va(mem);
187
188 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
189 size, nodeid);
190
191 return NULL;
192}
193
194/* Initialize bootmem allocator for a node */
195void __init
196setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
197{
198 unsigned long start_pfn, last_pfn, nodedata_phys;
199 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
200 int nid;
201#ifndef CONFIG_NO_BOOTMEM
202 unsigned long bootmap_start, bootmap_pages, bootmap_size;
203 void *bootmap;
204#endif
205
206 if (!end)
207 return;
208
209 /*
210 * Don't confuse VM with a node that doesn't have the
211 * minimum amount of memory:
212 */
213 if (end && (end - start) < NODE_MIN_SIZE)
214 return;
215
216 start = roundup(start, ZONE_ALIGN);
217
218 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
219 start, end);
220
221 start_pfn = start >> PAGE_SHIFT;
222 last_pfn = end >> PAGE_SHIFT;
223
224 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
225 SMP_CACHE_BYTES);
226 if (node_data[nodeid] == NULL)
227 return;
228 nodedata_phys = __pa(node_data[nodeid]);
229 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
230 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
231 nodedata_phys + pgdat_size - 1);
232 nid = phys_to_nid(nodedata_phys);
233 if (nid != nodeid)
234 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
235
236 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
237 NODE_DATA(nodeid)->node_id = nodeid;
238 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
239 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
240
241#ifndef CONFIG_NO_BOOTMEM
242 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
243
244 /*
245 * Find a place for the bootmem map
246 * nodedata_phys could be on other nodes by alloc_bootmem,
247 * so need to sure bootmap_start not to be small, otherwise
248 * early_node_mem will get that with find_e820_area instead
249 * of alloc_bootmem, that could clash with reserved range
250 */
251 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
252 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
253 /*
254 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
255 * to use that to align to PAGE_SIZE
256 */
257 bootmap = early_node_mem(nodeid, bootmap_start, end,
258 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
259 if (bootmap == NULL) {
260 free_early(nodedata_phys, nodedata_phys + pgdat_size);
261 node_data[nodeid] = NULL;
262 return;
263 }
264 bootmap_start = __pa(bootmap);
265 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
266 "BOOTMAP");
267
268 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
269 bootmap_start >> PAGE_SHIFT,
270 start_pfn, last_pfn);
271
272 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
273 bootmap_start, bootmap_start + bootmap_size - 1,
274 bootmap_pages);
275 nid = phys_to_nid(bootmap_start);
276 if (nid != nodeid)
277 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
278
279 free_bootmem_with_active_regions(nodeid, end);
280#endif
281
282 node_set_online(nodeid);
283}
284
285/*
286 * There are unfortunately some poorly designed mainboards around that
287 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
288 * mapping. To avoid this fill in the mapping for all possible CPUs,
289 * as the number of CPUs is not known yet. We round robin the existing
290 * nodes.
291 */
292void __init numa_init_array(void)
293{
294 int rr, i;
295
296 rr = first_node(node_online_map);
297 for (i = 0; i < nr_cpu_ids; i++) {
298 if (early_cpu_to_node(i) != NUMA_NO_NODE)
299 continue;
300 numa_set_node(i, rr);
301 rr = next_node(rr, node_online_map);
302 if (rr == MAX_NUMNODES)
303 rr = first_node(node_online_map);
304 }
305}
306
307#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */
309static struct bootnode nodes[MAX_NUMNODES] __initdata;
310static struct bootnode physnodes[MAX_NUMNODES] __initdata;
311static char *cmdline __initdata;
312
313static int __init setup_physnodes(unsigned long start, unsigned long end,
314 int acpi, int k8)
315{
316 int nr_nodes = 0;
317 int ret = 0;
318 int i;
319
320#ifdef CONFIG_ACPI_NUMA
321 if (acpi)
322 nr_nodes = acpi_get_nodes(physnodes);
323#endif
324#ifdef CONFIG_K8_NUMA
325 if (k8)
326 nr_nodes = k8_get_nodes(physnodes);
327#endif
328 /*
329 * Basic sanity checking on the physical node map: there may be errors
330 * if the SRAT or K8 incorrectly reported the topology or the mem=
331 * kernel parameter is used.
332 */
333 for (i = 0; i < nr_nodes; i++) {
334 if (physnodes[i].start == physnodes[i].end)
335 continue;
336 if (physnodes[i].start > end) {
337 physnodes[i].end = physnodes[i].start;
338 continue;
339 }
340 if (physnodes[i].end < start) {
341 physnodes[i].start = physnodes[i].end;
342 continue;
343 }
344 if (physnodes[i].start < start)
345 physnodes[i].start = start;
346 if (physnodes[i].end > end)
347 physnodes[i].end = end;
348 }
349
350 /*
351 * Remove all nodes that have no memory or were truncated because of the
352 * limited address range.
353 */
354 for (i = 0; i < nr_nodes; i++) {
355 if (physnodes[i].start == physnodes[i].end)
356 continue;
357 physnodes[ret].start = physnodes[i].start;
358 physnodes[ret].end = physnodes[i].end;
359 ret++;
360 }
361
362 /*
363 * If no physical topology was detected, a single node is faked to cover
364 * the entire address space.
365 */
366 if (!ret) {
367 physnodes[ret].start = start;
368 physnodes[ret].end = end;
369 ret = 1;
370 }
371 return ret;
372}
373
374/*
375 * Sets up nid to range from addr to addr + size. If the end
376 * boundary is greater than max_addr, then max_addr is used instead.
377 * The return value is 0 if there is additional memory left for
378 * allocation past addr and -1 otherwise. addr is adjusted to be at
379 * the end of the node.
380 */
381static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
382{
383 int ret = 0;
384 nodes[nid].start = *addr;
385 *addr += size;
386 if (*addr >= max_addr) {
387 *addr = max_addr;
388 ret = -1;
389 }
390 nodes[nid].end = *addr;
391 node_set(nid, node_possible_map);
392 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
393 nodes[nid].start, nodes[nid].end,
394 (nodes[nid].end - nodes[nid].start) >> 20);
395 return ret;
396}
397
398/*
399 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
400 * to max_addr. The return value is the number of nodes allocated.
401 */
402static int __init split_nodes_interleave(u64 addr, u64 max_addr,
403 int nr_phys_nodes, int nr_nodes)
404{ 10{
405 nodemask_t physnode_mask = NODE_MASK_NONE; 11 x86_numa_init();
406 u64 size;
407 int big;
408 int ret = 0;
409 int i;
410
411 if (nr_nodes <= 0)
412 return -1;
413 if (nr_nodes > MAX_NUMNODES) {
414 pr_info("numa=fake=%d too large, reducing to %d\n",
415 nr_nodes, MAX_NUMNODES);
416 nr_nodes = MAX_NUMNODES;
417 }
418
419 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
420 /*
421 * Calculate the number of big nodes that can be allocated as a result
422 * of consolidating the remainder.
423 */
424 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
425 FAKE_NODE_MIN_SIZE;
426
427 size &= FAKE_NODE_MIN_HASH_MASK;
428 if (!size) {
429 pr_err("Not enough memory for each node. "
430 "NUMA emulation disabled.\n");
431 return -1;
432 }
433
434 for (i = 0; i < nr_phys_nodes; i++)
435 if (physnodes[i].start != physnodes[i].end)
436 node_set(i, physnode_mask);
437
438 /*
439 * Continue to fill physical nodes with fake nodes until there is no
440 * memory left on any of them.
441 */
442 while (nodes_weight(physnode_mask)) {
443 for_each_node_mask(i, physnode_mask) {
444 u64 end = physnodes[i].start + size;
445 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
446
447 if (ret < big)
448 end += FAKE_NODE_MIN_SIZE;
449
450 /*
451 * Continue to add memory to this fake node if its
452 * non-reserved memory is less than the per-node size.
453 */
454 while (end - physnodes[i].start -
455 e820_hole_size(physnodes[i].start, end) < size) {
456 end += FAKE_NODE_MIN_SIZE;
457 if (end > physnodes[i].end) {
458 end = physnodes[i].end;
459 break;
460 }
461 }
462
463 /*
464 * If there won't be at least FAKE_NODE_MIN_SIZE of
465 * non-reserved memory in ZONE_DMA32 for the next node,
466 * this one must extend to the boundary.
467 */
468 if (end < dma32_end && dma32_end - end -
469 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
470 end = dma32_end;
471
472 /*
473 * If there won't be enough non-reserved memory for the
474 * next node, this one must extend to the end of the
475 * physical node.
476 */
477 if (physnodes[i].end - end -
478 e820_hole_size(end, physnodes[i].end) < size)
479 end = physnodes[i].end;
480
481 /*
482 * Avoid allocating more nodes than requested, which can
483 * happen as a result of rounding down each node's size
484 * to FAKE_NODE_MIN_SIZE.
485 */
486 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
487 end = physnodes[i].end;
488
489 if (setup_node_range(ret++, &physnodes[i].start,
490 end - physnodes[i].start,
491 physnodes[i].end) < 0)
492 node_clear(i, physnode_mask);
493 }
494 }
495 return ret;
496}
497
498/*
499 * Returns the end address of a node so that there is at least `size' amount of
500 * non-reserved memory or `max_addr' is reached.
501 */
502static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
503{
504 u64 end = start + size;
505
506 while (end - start - e820_hole_size(start, end) < size) {
507 end += FAKE_NODE_MIN_SIZE;
508 if (end > max_addr) {
509 end = max_addr;
510 break;
511 }
512 }
513 return end;
514}
515
516/*
517 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
518 * `addr' to `max_addr'. The return value is the number of nodes allocated.
519 */
520static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
521{
522 nodemask_t physnode_mask = NODE_MASK_NONE;
523 u64 min_size;
524 int ret = 0;
525 int i;
526
527 if (!size)
528 return -1;
529 /*
530 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
531 * increased accordingly if the requested size is too small. This
532 * creates a uniform distribution of node sizes across the entire
533 * machine (but not necessarily over physical nodes).
534 */
535 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
536 MAX_NUMNODES;
537 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
538 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
539 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
540 FAKE_NODE_MIN_HASH_MASK;
541 if (size < min_size) {
542 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
543 size >> 20, min_size >> 20);
544 size = min_size;
545 }
546 size &= FAKE_NODE_MIN_HASH_MASK;
547
548 for (i = 0; i < MAX_NUMNODES; i++)
549 if (physnodes[i].start != physnodes[i].end)
550 node_set(i, physnode_mask);
551 /*
552 * Fill physical nodes with fake nodes of size until there is no memory
553 * left on any of them.
554 */
555 while (nodes_weight(physnode_mask)) {
556 for_each_node_mask(i, physnode_mask) {
557 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
558 u64 end;
559
560 end = find_end_of_node(physnodes[i].start,
561 physnodes[i].end, size);
562 /*
563 * If there won't be at least FAKE_NODE_MIN_SIZE of
564 * non-reserved memory in ZONE_DMA32 for the next node,
565 * this one must extend to the boundary.
566 */
567 if (end < dma32_end && dma32_end - end -
568 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
569 end = dma32_end;
570
571 /*
572 * If there won't be enough non-reserved memory for the
573 * next node, this one must extend to the end of the
574 * physical node.
575 */
576 if (physnodes[i].end - end -
577 e820_hole_size(end, physnodes[i].end) < size)
578 end = physnodes[i].end;
579
580 /*
581 * Setup the fake node that will be allocated as bootmem
582 * later. If setup_node_range() returns non-zero, there
583 * is no more memory available on this physical node.
584 */
585 if (setup_node_range(ret++, &physnodes[i].start,
586 end - physnodes[i].start,
587 physnodes[i].end) < 0)
588 node_clear(i, physnode_mask);
589 }
590 }
591 return ret;
592}
593
594/*
595 * Sets up the system RAM area from start_pfn to last_pfn according to the
596 * numa=fake command-line option.
597 */
598static int __init numa_emulation(unsigned long start_pfn,
599 unsigned long last_pfn, int acpi, int k8)
600{
601 u64 addr = start_pfn << PAGE_SHIFT;
602 u64 max_addr = last_pfn << PAGE_SHIFT;
603 int num_phys_nodes;
604 int num_nodes;
605 int i;
606
607 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
608 /*
609 * If the numa=fake command-line contains a 'M' or 'G', it represents
610 * the fixed node size. Otherwise, if it is just a single number N,
611 * split the system RAM into N fake nodes.
612 */
613 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
614 u64 size;
615
616 size = memparse(cmdline, &cmdline);
617 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
618 } else {
619 unsigned long n;
620
621 n = simple_strtoul(cmdline, NULL, 0);
622 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
623 }
624
625 if (num_nodes < 0)
626 return num_nodes;
627 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
628 if (memnode_shift < 0) {
629 memnode_shift = 0;
630 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
631 "disabled.\n");
632 return -1;
633 }
634
635 /*
636 * We need to vacate all active ranges that may have been registered for
637 * the e820 memory map.
638 */
639 remove_all_active_ranges();
640 for_each_node_mask(i, node_possible_map) {
641 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
642 nodes[i].end >> PAGE_SHIFT);
643 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
644 }
645 acpi_fake_nodes(nodes, num_nodes);
646 numa_init_array();
647 return 0;
648}
649#endif /* CONFIG_NUMA_EMU */
650
651void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
652 int acpi, int k8)
653{
654 int i;
655
656 nodes_clear(node_possible_map);
657 nodes_clear(node_online_map);
658
659#ifdef CONFIG_NUMA_EMU
660 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
661 return;
662 nodes_clear(node_possible_map);
663 nodes_clear(node_online_map);
664#endif
665
666#ifdef CONFIG_ACPI_NUMA
667 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
668 last_pfn << PAGE_SHIFT))
669 return;
670 nodes_clear(node_possible_map);
671 nodes_clear(node_online_map);
672#endif
673
674#ifdef CONFIG_K8_NUMA
675 if (!numa_off && k8 && !k8_scan_nodes())
676 return;
677 nodes_clear(node_possible_map);
678 nodes_clear(node_online_map);
679#endif
680 printk(KERN_INFO "%s\n",
681 numa_off ? "NUMA turned off" : "No NUMA configuration found");
682
683 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
684 start_pfn << PAGE_SHIFT,
685 last_pfn << PAGE_SHIFT);
686 /* setup dummy node covering all memory */
687 memnode_shift = 63;
688 memnodemap = memnode.embedded_map;
689 memnodemap[0] = 0;
690 node_set_online(0);
691 node_set(0, node_possible_map);
692 for (i = 0; i < nr_cpu_ids; i++)
693 numa_set_node(i, 0);
694 e820_register_active_regions(0, start_pfn, last_pfn);
695 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
696} 12}
697 13
698unsigned long __init numa_free_all_bootmem(void) 14unsigned long __init numa_free_all_bootmem(void)
@@ -703,199 +19,7 @@ unsigned long __init numa_free_all_bootmem(void)
703 for_each_online_node(i) 19 for_each_online_node(i)
704 pages += free_all_bootmem_node(NODE_DATA(i)); 20 pages += free_all_bootmem_node(NODE_DATA(i));
705 21
706#ifdef CONFIG_NO_BOOTMEM
707 pages += free_all_memory_core_early(MAX_NUMNODES); 22 pages += free_all_memory_core_early(MAX_NUMNODES);
708#endif
709 23
710 return pages; 24 return pages;
711} 25}
712
713static __init int numa_setup(char *opt)
714{
715 if (!opt)
716 return -EINVAL;
717 if (!strncmp(opt, "off", 3))
718 numa_off = 1;
719#ifdef CONFIG_NUMA_EMU
720 if (!strncmp(opt, "fake=", 5))
721 cmdline = opt + 5;
722#endif
723#ifdef CONFIG_ACPI_NUMA
724 if (!strncmp(opt, "noacpi", 6))
725 acpi_numa = -1;
726#endif
727 return 0;
728}
729early_param("numa", numa_setup);
730
731#ifdef CONFIG_NUMA
732
733static __init int find_near_online_node(int node)
734{
735 int n, val;
736 int min_val = INT_MAX;
737 int best_node = -1;
738
739 for_each_online_node(n) {
740 val = node_distance(node, n);
741
742 if (val < min_val) {
743 min_val = val;
744 best_node = n;
745 }
746 }
747
748 return best_node;
749}
750
751/*
752 * Setup early cpu_to_node.
753 *
754 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
755 * and apicid_to_node[] tables have valid entries for a CPU.
756 * This means we skip cpu_to_node[] initialisation for NUMA
757 * emulation and faking node case (when running a kernel compiled
758 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
759 * is already initialized in a round robin manner at numa_init_array,
760 * prior to this call, and this initialization is good enough
761 * for the fake NUMA cases.
762 *
763 * Called before the per_cpu areas are setup.
764 */
765void __init init_cpu_to_node(void)
766{
767 int cpu;
768 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
769
770 BUG_ON(cpu_to_apicid == NULL);
771
772 for_each_possible_cpu(cpu) {
773 int node;
774 u16 apicid = cpu_to_apicid[cpu];
775
776 if (apicid == BAD_APICID)
777 continue;
778 node = apicid_to_node[apicid];
779 if (node == NUMA_NO_NODE)
780 continue;
781 if (!node_online(node))
782 node = find_near_online_node(node);
783 numa_set_node(cpu, node);
784 }
785}
786#endif
787
788
789void __cpuinit numa_set_node(int cpu, int node)
790{
791 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
792
793 /* early setting, no percpu area yet */
794 if (cpu_to_node_map) {
795 cpu_to_node_map[cpu] = node;
796 return;
797 }
798
799#ifdef CONFIG_DEBUG_PER_CPU_MAPS
800 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
801 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
802 dump_stack();
803 return;
804 }
805#endif
806 per_cpu(x86_cpu_to_node_map, cpu) = node;
807
808 if (node != NUMA_NO_NODE)
809 set_cpu_numa_node(cpu, node);
810}
811
812void __cpuinit numa_clear_node(int cpu)
813{
814 numa_set_node(cpu, NUMA_NO_NODE);
815}
816
817#ifndef CONFIG_DEBUG_PER_CPU_MAPS
818
819void __cpuinit numa_add_cpu(int cpu)
820{
821 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
822}
823
824void __cpuinit numa_remove_cpu(int cpu)
825{
826 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
827}
828
829#else /* CONFIG_DEBUG_PER_CPU_MAPS */
830
831/*
832 * --------- debug versions of the numa functions ---------
833 */
834static void __cpuinit numa_set_cpumask(int cpu, int enable)
835{
836 int node = early_cpu_to_node(cpu);
837 struct cpumask *mask;
838 char buf[64];
839
840 mask = node_to_cpumask_map[node];
841 if (mask == NULL) {
842 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
843 dump_stack();
844 return;
845 }
846
847 if (enable)
848 cpumask_set_cpu(cpu, mask);
849 else
850 cpumask_clear_cpu(cpu, mask);
851
852 cpulist_scnprintf(buf, sizeof(buf), mask);
853 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
854 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
855}
856
857void __cpuinit numa_add_cpu(int cpu)
858{
859 numa_set_cpumask(cpu, 1);
860}
861
862void __cpuinit numa_remove_cpu(int cpu)
863{
864 numa_set_cpumask(cpu, 0);
865}
866
867int __cpu_to_node(int cpu)
868{
869 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
870 printk(KERN_WARNING
871 "cpu_to_node(%d): usage too early!\n", cpu);
872 dump_stack();
873 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
874 }
875 return per_cpu(x86_cpu_to_node_map, cpu);
876}
877EXPORT_SYMBOL(__cpu_to_node);
878
879/*
880 * Same function as cpu_to_node() but used if called before the
881 * per_cpu areas are setup.
882 */
883int early_cpu_to_node(int cpu)
884{
885 if (early_per_cpu_ptr(x86_cpu_to_node_map))
886 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
887
888 if (!cpu_possible(cpu)) {
889 printk(KERN_WARNING
890 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
891 dump_stack();
892 return NUMA_NO_NODE;
893 }
894 return per_cpu(x86_cpu_to_node_map, cpu);
895}
896
897/*
898 * --------- end of debug versions of the numa functions ---------
899 */
900
901#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..d0ed086b6247
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,492 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <linux/bootmem.h>
9#include <asm/dma.h>
10
11#include "numa_internal.h"
12
13static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
14static char *emu_cmdline __initdata;
15
16void __init numa_emu_cmdline(char *str)
17{
18 emu_cmdline = str;
19}
20
21static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
22{
23 int i;
24
25 for (i = 0; i < mi->nr_blks; i++)
26 if (mi->blk[i].nid == nid)
27 return i;
28 return -ENOENT;
29}
30
31/*
32 * Sets up nid to range from @start to @end. The return value is -errno if
33 * something went wrong, 0 otherwise.
34 */
35static int __init emu_setup_memblk(struct numa_meminfo *ei,
36 struct numa_meminfo *pi,
37 int nid, int phys_blk, u64 size)
38{
39 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
40 struct numa_memblk *pb = &pi->blk[phys_blk];
41
42 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
43 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
44 return -EINVAL;
45 }
46
47 ei->nr_blks++;
48 eb->start = pb->start;
49 eb->end = pb->start + size;
50 eb->nid = nid;
51
52 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
53 emu_nid_to_phys[nid] = pb->nid;
54
55 pb->start += size;
56 if (pb->start >= pb->end) {
57 WARN_ON_ONCE(pb->start > pb->end);
58 numa_remove_memblk_from(phys_blk, pi);
59 }
60
61 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
62 eb->start, eb->end, (eb->end - eb->start) >> 20);
63 return 0;
64}
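A worked example of the carving emu_setup_memblk() above performs, with made-up numbers: one physical block [0, 4G) on node 0 and a requested emulated size of 1 GiB. The emulated block becomes [0, 1G) and the physical block shrinks to [1G, 4G); only when its start catches up with its end is it removed from the physical meminfo. A trivial standalone model:

#include <stdio.h>
#include <stdint.h>

struct blk { uint64_t start, end; int nid; };

int main(void)
{
	struct blk phys = { 0x0, 0x100000000ULL, 0 };	/* [0, 4G) on phys node 0 */
	uint64_t size = 1ULL << 30;			/* carve 1 GiB            */
	struct blk emu;

	emu.start = phys.start;
	emu.end   = phys.start + size;			/* emulated blk [0, 1G)   */
	emu.nid   = 0;					/* first emulated nid     */
	phys.start += size;				/* phys shrinks to [1G, 4G) */

	printf("emu  [%#llx-%#llx) nid %d\n",
	       (unsigned long long)emu.start, (unsigned long long)emu.end, emu.nid);
	printf("phys [%#llx-%#llx) remaining\n",
	       (unsigned long long)phys.start, (unsigned long long)phys.end);
	return 0;
}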
65
66/*
67 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
68 * to max_addr. The return value is the number of nodes allocated.
69 */
70static int __init split_nodes_interleave(struct numa_meminfo *ei,
71 struct numa_meminfo *pi,
72 u64 addr, u64 max_addr, int nr_nodes)
73{
74 nodemask_t physnode_mask = NODE_MASK_NONE;
75 u64 size;
76 int big;
77 int nid = 0;
78 int i, ret;
79
80 if (nr_nodes <= 0)
81 return -1;
82 if (nr_nodes > MAX_NUMNODES) {
83 pr_info("numa=fake=%d too large, reducing to %d\n",
84 nr_nodes, MAX_NUMNODES);
85 nr_nodes = MAX_NUMNODES;
86 }
87
88 /*
89 * Calculate target node size. x86_32 freaks on __udivdi3() so do
90 * the division in ulong number of pages and convert back.
91 */
92 size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
93 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
94
95 /*
96 * Calculate the number of big nodes that can be allocated as a result
97 * of consolidating the remainder.
98 */
99 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
100 FAKE_NODE_MIN_SIZE;
101
102 size &= FAKE_NODE_MIN_HASH_MASK;
103 if (!size) {
104 pr_err("Not enough memory for each node. "
105 "NUMA emulation disabled.\n");
106 return -1;
107 }
108
109 for (i = 0; i < pi->nr_blks; i++)
110 node_set(pi->blk[i].nid, physnode_mask);
111
112 /*
113 * Continue to fill physical nodes with fake nodes until there is no
114 * memory left on any of them.
115 */
116 while (nodes_weight(physnode_mask)) {
117 for_each_node_mask(i, physnode_mask) {
118 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
119 u64 start, limit, end;
120 int phys_blk;
121
122 phys_blk = emu_find_memblk_by_nid(i, pi);
123 if (phys_blk < 0) {
124 node_clear(i, physnode_mask);
125 continue;
126 }
127 start = pi->blk[phys_blk].start;
128 limit = pi->blk[phys_blk].end;
129 end = start + size;
130
131 if (nid < big)
132 end += FAKE_NODE_MIN_SIZE;
133
134 /*
135 * Continue to add memory to this fake node if its
136 * non-reserved memory is less than the per-node size.
137 */
138 while (end - start -
139 memblock_x86_hole_size(start, end) < size) {
140 end += FAKE_NODE_MIN_SIZE;
141 if (end > limit) {
142 end = limit;
143 break;
144 }
145 }
146
147 /*
148 * If there won't be at least FAKE_NODE_MIN_SIZE of
149 * non-reserved memory in ZONE_DMA32 for the next node,
150 * this one must extend to the boundary.
151 */
152 if (end < dma32_end && dma32_end - end -
153 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
154 end = dma32_end;
155
156 /*
157 * If there won't be enough non-reserved memory for the
158 * next node, this one must extend to the end of the
159 * physical node.
160 */
161 if (limit - end -
162 memblock_x86_hole_size(end, limit) < size)
163 end = limit;
164
165 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
166 phys_blk,
167 min(end, limit) - start);
168 if (ret < 0)
169 return ret;
170 }
171 }
172 return 0;
173}
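The __udivdi3 note in split_nodes_interleave() above refers to 64-bit division on x86_32: dividing a u64 by an integer makes gcc emit a call to the libgcc helper __udivdi3(), which the 32-bit kernel intentionally does not provide, so the code divides a page count held in an unsigned long and converts back with PFN_PHYS(). A standalone model of that pattern, with PAGE_SHIFT and the memory span as assumptions:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PFN_PHYS(pfn) ((uint64_t)(pfn) << PAGE_SHIFT)

int main(void)
{
	uint64_t span = 6ULL << 30;		/* 6 GiB of usable memory */
	int nr_nodes = 4;			/* e.g. numa=fake=4       */

	/* divide in pages (fits an unsigned long), then convert back */
	unsigned long pages = (unsigned long)(span >> PAGE_SHIFT);
	uint64_t size = PFN_PHYS(pages / nr_nodes);

	printf("per-node size: %llu MiB\n", (unsigned long long)(size >> 20));
	return 0;
}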
174
175/*
176 * Returns the end address of a node so that there is at least `size' amount of
177 * non-reserved memory or `max_addr' is reached.
178 */
179static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
180{
181 u64 end = start + size;
182
183 while (end - start - memblock_x86_hole_size(start, end) < size) {
184 end += FAKE_NODE_MIN_SIZE;
185 if (end > max_addr) {
186 end = max_addr;
187 break;
188 }
189 }
190 return end;
191}
192
193/*
194 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
195 * `addr' to `max_addr'. The return value is the number of nodes allocated.
196 */
197static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
198 struct numa_meminfo *pi,
199 u64 addr, u64 max_addr, u64 size)
200{
201 nodemask_t physnode_mask = NODE_MASK_NONE;
202 u64 min_size;
203 int nid = 0;
204 int i, ret;
205
206 if (!size)
207 return -1;
208 /*
209 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
210 * increased accordingly if the requested size is too small. This
211 * creates a uniform distribution of node sizes across the entire
212 * machine (but not necessarily over physical nodes).
213 */
214 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
215 MAX_NUMNODES;
216 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
217 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
218 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
219 FAKE_NODE_MIN_HASH_MASK;
220 if (size < min_size) {
221 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
222 size >> 20, min_size >> 20);
223 size = min_size;
224 }
225 size &= FAKE_NODE_MIN_HASH_MASK;
226
227 for (i = 0; i < pi->nr_blks; i++)
228 node_set(pi->blk[i].nid, physnode_mask);
229
230 /*
231 * Fill physical nodes with fake nodes of size until there is no memory
232 * left on any of them.
233 */
234 while (nodes_weight(physnode_mask)) {
235 for_each_node_mask(i, physnode_mask) {
236 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
237 u64 start, limit, end;
238 int phys_blk;
239
240 phys_blk = emu_find_memblk_by_nid(i, pi);
241 if (phys_blk < 0) {
242 node_clear(i, physnode_mask);
243 continue;
244 }
245 start = pi->blk[phys_blk].start;
246 limit = pi->blk[phys_blk].end;
247
248 end = find_end_of_node(start, limit, size);
249 /*
250 * If there won't be at least FAKE_NODE_MIN_SIZE of
251 * non-reserved memory in ZONE_DMA32 for the next node,
252 * this one must extend to the boundary.
253 */
254 if (end < dma32_end && dma32_end - end -
255 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
256 end = dma32_end;
257
258 /*
259 * If there won't be enough non-reserved memory for the
260 * next node, this one must extend to the end of the
261 * physical node.
262 */
263 if (limit - end -
264 memblock_x86_hole_size(end, limit) < size)
265 end = limit;
266
267 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
268 phys_blk,
269 min(end, limit) - start);
270 if (ret < 0)
271 return ret;
272 }
273 }
274 return 0;
275}
276
277/**
278 * numa_emulation - Emulate NUMA nodes
279 * @numa_meminfo: NUMA configuration to massage
280 * @numa_dist_cnt: The size of the physical NUMA distance table
281 *
282 * Emulate NUMA nodes according to the numa=fake kernel parameter.
283 * @numa_meminfo contains the physical memory configuration and is modified
284 * to reflect the emulated configuration on success. @numa_dist_cnt is
285 * used to determine the size of the physical distance table.
286 *
287 * On success, the following modifications are made.
288 *
289 * - @numa_meminfo is updated to reflect the emulated nodes.
290 *
291 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
292 * emulated nodes.
293 *
294 * - NUMA distance table is rebuilt to represent distances between emulated
295 * nodes. The distances are determined considering how emulated nodes
296 * are mapped to physical nodes and match the actual distances.
297 *
298 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
299 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
300 *
301 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
302 * identity mapping and no other modification is made.
303 */
304void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
305{
306 static struct numa_meminfo ei __initdata;
307 static struct numa_meminfo pi __initdata;
308 const u64 max_addr = PFN_PHYS(max_pfn);
309 u8 *phys_dist = NULL;
310 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
311 int max_emu_nid, dfl_phys_nid;
312 int i, j, ret;
313
314 if (!emu_cmdline)
315 goto no_emu;
316
317 memset(&ei, 0, sizeof(ei));
318 pi = *numa_meminfo;
319
320 for (i = 0; i < MAX_NUMNODES; i++)
321 emu_nid_to_phys[i] = NUMA_NO_NODE;
322
323 /*
324 * If the numa=fake command-line contains a 'M' or 'G', it represents
325 * the fixed node size. Otherwise, if it is just a single number N,
326 * split the system RAM into N fake nodes.
327 */
328 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
329 u64 size;
330
331 size = memparse(emu_cmdline, &emu_cmdline);
332 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
333 } else {
334 unsigned long n;
335
336 n = simple_strtoul(emu_cmdline, NULL, 0);
337 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
338 }
339
340 if (ret < 0)
341 goto no_emu;
342
343 if (numa_cleanup_meminfo(&ei) < 0) {
344 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
345 goto no_emu;
346 }
347
348 /* copy the physical distance table */
349 if (numa_dist_cnt) {
350 u64 phys;
351
352 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
353 phys_size, PAGE_SIZE);
354 if (phys == MEMBLOCK_ERROR) {
355 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
356 goto no_emu;
357 }
358 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
359 phys_dist = __va(phys);
360
361 for (i = 0; i < numa_dist_cnt; i++)
362 for (j = 0; j < numa_dist_cnt; j++)
363 phys_dist[i * numa_dist_cnt + j] =
364 node_distance(i, j);
365 }
366
367 /*
368 * Determine the max emulated nid and the default phys nid to use
369 * for unmapped nodes.
370 */
371 max_emu_nid = 0;
372 dfl_phys_nid = NUMA_NO_NODE;
373 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
374 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
375 max_emu_nid = i;
376 if (dfl_phys_nid == NUMA_NO_NODE)
377 dfl_phys_nid = emu_nid_to_phys[i];
378 }
379 }
380 if (dfl_phys_nid == NUMA_NO_NODE) {
381 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
382 goto no_emu;
383 }
384
385 /* commit */
386 *numa_meminfo = ei;
387
388 /*
389 * Transform __apicid_to_node table to use emulated nids by
390 * reverse-mapping phys_nid. The maps should always exist but fall
391 * back to zero just in case.
392 */
393 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
394 if (__apicid_to_node[i] == NUMA_NO_NODE)
395 continue;
396 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
397 if (__apicid_to_node[i] == emu_nid_to_phys[j])
398 break;
399 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
400 }
401
402 /* make sure all emulated nodes are mapped to a physical node */
403 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
404 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
405 emu_nid_to_phys[i] = dfl_phys_nid;
406
407 /* transform distance table */
408 numa_reset_distance();
409 for (i = 0; i < max_emu_nid + 1; i++) {
410 for (j = 0; j < max_emu_nid + 1; j++) {
411 int physi = emu_nid_to_phys[i];
412 int physj = emu_nid_to_phys[j];
413 int dist;
414
415 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
416 dist = physi == physj ?
417 LOCAL_DISTANCE : REMOTE_DISTANCE;
418 else
419 dist = phys_dist[physi * numa_dist_cnt + physj];
420
421 numa_set_distance(i, j, dist);
422 }
423 }
424
425 /* free the copied physical distance table */
426 if (phys_dist)
427 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
428 return;
429
430no_emu:
431 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
432 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
433 emu_nid_to_phys[i] = i;
434}
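The distance-table transform at the end of numa_emulation() above boils down to indexing the physical distance table through emu_nid_to_phys[]. A standalone model with invented numbers: two physical nodes with distances {10, 20} and four emulated nodes mapped as {0, 0, 1, 1}; nodes outside the physical table would instead get LOCAL_DISTANCE or REMOTE_DISTANCE.

#include <stdio.h>

int main(void)
{
	int phys_dist[2][2] = { { 10, 20 }, { 20, 10 } };	/* hypothetical SLIT   */
	int emu_nid_to_phys[4] = { 0, 0, 1, 1 };		/* 2 emulated per phys */
	int i, j;

	for (i = 0; i < 4; i++) {
		for (j = 0; j < 4; j++)
			printf("%3d", phys_dist[emu_nid_to_phys[i]][emu_nid_to_phys[j]]);
		printf("\n");
	}
	return 0;
}

The printed 4x4 matrix keeps distance 10 between emulated nodes that share a physical node and 20 otherwise, which is what the docstring means by the emulated distances matching the actual ones.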
435
436#ifndef CONFIG_DEBUG_PER_CPU_MAPS
437void __cpuinit numa_add_cpu(int cpu)
438{
439 int physnid, nid;
440
441 nid = early_cpu_to_node(cpu);
442 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
443
444 physnid = emu_nid_to_phys[nid];
445
446 /*
447 * Map the cpu to each emulated node that is allocated on the physical
448 * node of the cpu's apic id.
449 */
450 for_each_online_node(nid)
451 if (emu_nid_to_phys[nid] == physnid)
452 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
453}
454
455void __cpuinit numa_remove_cpu(int cpu)
456{
457 int i;
458
459 for_each_online_node(i)
460 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
461}
462#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
463static void __cpuinit numa_set_cpumask(int cpu, bool enable)
464{
465 int nid, physnid;
466
467 nid = early_cpu_to_node(cpu);
468 if (nid == NUMA_NO_NODE) {
469 /* early_cpu_to_node() already emits a warning and trace */
470 return;
471 }
472
473 physnid = emu_nid_to_phys[nid];
474
475 for_each_online_node(nid) {
476 if (emu_nid_to_phys[nid] != physnid)
477 continue;
478
479 debug_cpumask_set_cpu(cpu, nid, enable);
480 }
481}
482
483void __cpuinit numa_add_cpu(int cpu)
484{
485 numa_set_cpumask(cpu, true);
486}
487
488void __cpuinit numa_remove_cpu(int cpu)
489{
490 numa_set_cpumask(cpu, false);
491}
492#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..7178c3afe05e
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,39 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22void __init x86_numa_init(void);
23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
30#ifdef CONFIG_NUMA_EMU
31void __init numa_emulation(struct numa_meminfo *numa_meminfo,
32 int numa_dist_cnt);
33#else
34static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
35 int numa_dist_cnt)
36{ }
37#endif
38
39#endif /* __X86_MM_NUMA_INTERNAL_H */
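For orientation, a hedged standalone model of how the numa_meminfo table declared above is typically walked, in the spirit of emu_find_memblk_by_nid() in numa_emulation.c: iterate blk[0..nr_blks) and filter by nid. NR_NODE_MEMBLKS and the example ranges below are assumptions for the model, not the kernel's values.

#include <stdio.h>
#include <stdint.h>

#define NR_NODE_MEMBLKS 8	/* assumption for the model */

struct numa_memblk { uint64_t start, end; int nid; };
struct numa_meminfo { int nr_blks; struct numa_memblk blk[NR_NODE_MEMBLKS]; };

static uint64_t node_bytes(const struct numa_meminfo *mi, int nid)
{
	uint64_t total = 0;
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			total += mi->blk[i].end - mi->blk[i].start;
	return total;
}

int main(void)
{
	struct numa_meminfo mi = {
		.nr_blks = 2,
		.blk = {
			{ 0x00000000ULL, 0x80000000ULL, 0 },	/* 2 GiB on node 0 */
			{ 0x80000000ULL, 0xc0000000ULL, 1 },	/* 1 GiB on node 1 */
		},
	};

	printf("node 0: %llu MiB\n", (unsigned long long)(node_bytes(&mi, 0) >> 20));
	printf("node 1: %llu MiB\n", (unsigned long long)(node_bytes(&mi, 1) >> 20));
	return 0;
}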
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
13#include <linux/pfn.h> 13#include <linux/pfn.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/pci.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -56,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];
56 57
57void update_page_count(int level, unsigned long pages) 58void update_page_count(int level, unsigned long pages)
58{ 59{
59 unsigned long flags;
60
61 /* Protect against CPA */ 60 /* Protect against CPA */
62 spin_lock_irqsave(&pgd_lock, flags); 61 spin_lock(&pgd_lock);
63 direct_pages_count[level] += pages; 62 direct_pages_count[level] += pages;
64 spin_unlock_irqrestore(&pgd_lock, flags); 63 spin_unlock(&pgd_lock);
65} 64}
66 65
67static void split_page_count(int level) 66static void split_page_count(int level)
@@ -260,8 +259,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
260 * The BIOS area between 640k and 1Mb needs to be executable for 259 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 260 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 261 */
263 if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 262#ifdef CONFIG_PCI_BIOS
263 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
264 pgprot_val(forbidden) |= _PAGE_NX; 264 pgprot_val(forbidden) |= _PAGE_NX;
265#endif
265 266
266 /* 267 /*
267 * The kernel text needs to be executable for obvious reasons 268 * The kernel text needs to be executable for obvious reasons
@@ -309,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
309 * these shared mappings are made of small page mappings. 310 * these shared mappings are made of small page mappings.
310 * Thus this don't enforce !RW mapping for small page kernel 311 * Thus this don't enforce !RW mapping for small page kernel
311 * text mapping logic will help Linux Xen parvirt guest boot 312 * text mapping logic will help Linux Xen parvirt guest boot
312 * aswell. 313 * as well.
313 */ 314 */
314 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 315 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
315 pgprot_val(forbidden) |= _PAGE_RW; 316 pgprot_val(forbidden) |= _PAGE_RW;
@@ -391,16 +392,16 @@ static int
391try_preserve_large_page(pte_t *kpte, unsigned long address, 392try_preserve_large_page(pte_t *kpte, unsigned long address,
392 struct cpa_data *cpa) 393 struct cpa_data *cpa)
393{ 394{
394 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 395 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
395 pte_t new_pte, old_pte, *tmp; 396 pte_t new_pte, old_pte, *tmp;
396 pgprot_t old_prot, new_prot; 397 pgprot_t old_prot, new_prot, req_prot;
397 int i, do_split = 1; 398 int i, do_split = 1;
398 unsigned int level; 399 unsigned int level;
399 400
400 if (cpa->force_split) 401 if (cpa->force_split)
401 return 1; 402 return 1;
402 403
403 spin_lock_irqsave(&pgd_lock, flags); 404 spin_lock(&pgd_lock);
404 /* 405 /*
405 * Check for races, another CPU might have split this page 406 * Check for races, another CPU might have split this page
406 * up already: 407 * up already:
@@ -438,10 +439,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
438 * We are safe now. Check whether the new pgprot is the same: 439 * We are safe now. Check whether the new pgprot is the same:
439 */ 440 */
440 old_pte = *kpte; 441 old_pte = *kpte;
441 old_prot = new_prot = pte_pgprot(old_pte); 442 old_prot = new_prot = req_prot = pte_pgprot(old_pte);
442 443
443 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 444 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
444 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 445 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
445 446
446 /* 447 /*
447 * old_pte points to the large page base address. So we need 448 * old_pte points to the large page base address. So we need
@@ -450,17 +451,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
450 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 451 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
451 cpa->pfn = pfn; 452 cpa->pfn = pfn;
452 453
453 new_prot = static_protections(new_prot, address, pfn); 454 new_prot = static_protections(req_prot, address, pfn);
454 455
455 /* 456 /*
456 * We need to check the full range, whether 457 * We need to check the full range, whether
457 * static_protection() requires a different pgprot for one of 458 * static_protection() requires a different pgprot for one of
458 * the pages in the range we try to preserve: 459 * the pages in the range we try to preserve:
459 */ 460 */
460 addr = address + PAGE_SIZE; 461 addr = address & pmask;
461 pfn++; 462 pfn = pte_pfn(old_pte);
462 for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { 463 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
463 pgprot_t chk_prot = static_protections(new_prot, addr, pfn); 464 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
464 465
465 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 466 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
466 goto out_unlock; 467 goto out_unlock;
@@ -483,7 +484,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
483 * that we limited the number of possible pages already to 484 * that we limited the number of possible pages already to
484 * the number of pages in the large page. 485 * the number of pages in the large page.
485 */ 486 */
486 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { 487 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
487 /* 488 /*
488 * The address is aligned and the number of pages 489 * The address is aligned and the number of pages
489 * covers the full page. 490 * covers the full page.
@@ -495,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
495 } 496 }
496 497
497out_unlock: 498out_unlock:
498 spin_unlock_irqrestore(&pgd_lock, flags); 499 spin_unlock(&pgd_lock);
499 500
500 return do_split; 501 return do_split;
501} 502}
502 503
503static int split_large_page(pte_t *kpte, unsigned long address) 504static int split_large_page(pte_t *kpte, unsigned long address)
504{ 505{
505 unsigned long flags, pfn, pfninc = 1; 506 unsigned long pfn, pfninc = 1;
506 unsigned int i, level; 507 unsigned int i, level;
507 pte_t *pbase, *tmp; 508 pte_t *pbase, *tmp;
508 pgprot_t ref_prot; 509 pgprot_t ref_prot;
@@ -516,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
516 if (!base) 517 if (!base)
517 return -ENOMEM; 518 return -ENOMEM;
518 519
519 spin_lock_irqsave(&pgd_lock, flags); 520 spin_lock(&pgd_lock);
520 /* 521 /*
521 * Check for races, another CPU might have split this page 522 * Check for races, another CPU might have split this page
522 * up for us already: 523 * up for us already:
@@ -588,7 +589,7 @@ out_unlock:
588 */ 589 */
589 if (base) 590 if (base)
590 __free_page(base); 591 __free_page(base);
591 spin_unlock_irqrestore(&pgd_lock, flags); 592 spin_unlock(&pgd_lock);
592 593
593 return 0; 594 return 0;
594} 595}
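The pageattr.c hunks change two things: the CPA paths now take pgd_lock as a plain (non-IRQ-saving) spinlock, and the static_protections() re-check in try_preserve_large_page() now walks every 4 KiB page of the large mapping (addr = address & pmask, psize >> PAGE_SHIFT iterations) rather than only the requested pages, with the keep-the-large-page shortcut requiring the request to start at the large-page boundary and cover exactly psize >> PAGE_SHIFT pages. The stand-alone sketch below only demonstrates that alignment arithmetic; PSIZE and PMASK are illustrative stand-ins for a 2 MiB page.

/* User-space sketch of the large-page alignment check; illustrative only. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PSIZE		(2UL << 20)		/* 2 MiB large page */
#define PMASK		(~(PSIZE - 1))		/* mask down to the large-page base */

static int covers_whole_large_page(unsigned long address, unsigned long numpages)
{
	/* mirrors: address == (address & pmask) && numpages == (psize >> PAGE_SHIFT) */
	return address == (address & PMASK) &&
	       numpages == (PSIZE >> PAGE_SHIFT);
}

int main(void)
{
	/* 0x200000 is 2 MiB aligned; 512 x 4 KiB pages span the whole mapping. */
	printf("%d\n", covers_whole_large_page(0x200000, 512));	/* 1 */
	/* a misaligned start or partial coverage falls back to the split path */
	printf("%d\n", covers_whole_large_page(0x201000, 512));	/* 0 */
	printf("%d\n", covers_whole_large_page(0x200000, 16));		/* 0 */
	return 0;
}
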
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c497..9f0614daea85 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
414 unsigned char *p; 414 unsigned char *p;
415 struct prefix_bits prf; 415 struct prefix_bits prf;
416 int i; 416 int i;
417 unsigned long rv;
418 417
419 p = (unsigned char *)ins_addr; 418 p = (unsigned char *)ins_addr;
420 p += skip_prefix(p, &prf); 419 p += skip_prefix(p, &prf);
421 p += get_opcode(p, &opcode); 420 p += get_opcode(p, &opcode);
422 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 421 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
423 if (reg_rop[i] == opcode) { 422 if (reg_rop[i] == opcode)
424 rv = REG_READ;
425 goto do_work; 423 goto do_work;
426 }
427 424
428 for (i = 0; i < ARRAY_SIZE(reg_wop); i++) 425 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
429 if (reg_wop[i] == opcode) { 426 if (reg_wop[i] == opcode)
430 rv = REG_WRITE;
431 goto do_work; 427 goto do_work;
432 }
433 428
434 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " 429 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
435 "0x%02x\n", opcode); 430 "0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
474 unsigned char *p; 469 unsigned char *p;
475 struct prefix_bits prf; 470 struct prefix_bits prf;
476 int i; 471 int i;
477 unsigned long rv;
478 472
479 p = (unsigned char *)ins_addr; 473 p = (unsigned char *)ins_addr;
480 p += skip_prefix(p, &prf); 474 p += skip_prefix(p, &prf);
481 p += get_opcode(p, &opcode); 475 p += get_opcode(p, &opcode);
482 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 476 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
483 if (imm_wop[i] == opcode) { 477 if (imm_wop[i] == opcode)
484 rv = IMM_WRITE;
485 goto do_work; 478 goto do_work;
486 }
487 479
488 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " 480 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
489 "0x%02x\n", opcode); 481 "0x%02x\n", opcode);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
87#define UNSHARED_PTRS_PER_PGD \ 87#define UNSHARED_PTRS_PER_PGD \
88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89 89
90static void pgd_ctor(pgd_t *pgd) 90
91static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
92{
93 BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
94 virt_to_page(pgd)->index = (pgoff_t)mm;
95}
96
97struct mm_struct *pgd_page_get_mm(struct page *page)
98{
99 return (struct mm_struct *)page->index;
100}
101
102static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
91{ 103{
92 /* If the pgd points to a shared pagetable level (either the 104 /* If the pgd points to a shared pagetable level (either the
93 ptes in non-PAE, or shared PMD in PAE), then just copy the 105 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,27 +110,23 @@ static void pgd_ctor(pgd_t *pgd)
98 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 110 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
99 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 111 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
100 KERNEL_PGD_PTRS); 112 KERNEL_PGD_PTRS);
101 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
102 __pa(swapper_pg_dir) >> PAGE_SHIFT,
103 KERNEL_PGD_BOUNDARY,
104 KERNEL_PGD_PTRS);
105 } 113 }
106 114
107 /* list required to sync kernel mapping updates */ 115 /* list required to sync kernel mapping updates */
108 if (!SHARED_KERNEL_PMD) 116 if (!SHARED_KERNEL_PMD) {
117 pgd_set_mm(pgd, mm);
109 pgd_list_add(pgd); 118 pgd_list_add(pgd);
119 }
110} 120}
111 121
112static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
113{ 123{
114 unsigned long flags; /* can be called from interrupt context */
115
116 if (SHARED_KERNEL_PMD) 124 if (SHARED_KERNEL_PMD)
117 return; 125 return;
118 126
119 spin_lock_irqsave(&pgd_lock, flags); 127 spin_lock(&pgd_lock);
120 pgd_list_del(pgd); 128 pgd_list_del(pgd);
121 spin_unlock_irqrestore(&pgd_lock, flags); 129 spin_unlock(&pgd_lock);
122} 130}
123 131
124/* 132/*
@@ -160,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
160 * section 8.1: in PAE mode we explicitly have to flush the 168 * section 8.1: in PAE mode we explicitly have to flush the
161 * TLB via cr3 if the top-level pgd is changed... 169 * TLB via cr3 if the top-level pgd is changed...
162 */ 170 */
163 if (mm == current->active_mm) 171 flush_tlb_mm(mm);
164 write_cr3(read_cr3());
165} 172}
166#else /* !CONFIG_X86_PAE */ 173#else /* !CONFIG_X86_PAE */
167 174
@@ -250,7 +257,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
250{ 257{
251 pgd_t *pgd; 258 pgd_t *pgd;
252 pmd_t *pmds[PREALLOCATED_PMDS]; 259 pmd_t *pmds[PREALLOCATED_PMDS];
253 unsigned long flags;
254 260
255 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); 261 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
256 262
@@ -270,12 +276,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
270 * respect to anything walking the pgd_list, so that they 276 * respect to anything walking the pgd_list, so that they
271 * never see a partially populated pgd. 277 * never see a partially populated pgd.
272 */ 278 */
273 spin_lock_irqsave(&pgd_lock, flags); 279 spin_lock(&pgd_lock);
274 280
275 pgd_ctor(pgd); 281 pgd_ctor(mm, pgd);
276 pgd_prepopulate_pmd(mm, pgd, pmds); 282 pgd_prepopulate_pmd(mm, pgd, pmds);
277 283
278 spin_unlock_irqrestore(&pgd_lock, flags); 284 spin_unlock(&pgd_lock);
279 285
280 return pgd; 286 return pgd;
281 287
@@ -310,6 +316,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
310 return changed; 316 return changed;
311} 317}
312 318
319#ifdef CONFIG_TRANSPARENT_HUGEPAGE
320int pmdp_set_access_flags(struct vm_area_struct *vma,
321 unsigned long address, pmd_t *pmdp,
322 pmd_t entry, int dirty)
323{
324 int changed = !pmd_same(*pmdp, entry);
325
326 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
327
328 if (changed && dirty) {
329 *pmdp = entry;
330 pmd_update_defer(vma->vm_mm, address, pmdp);
331 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
332 }
333
334 return changed;
335}
336#endif
337
313int ptep_test_and_clear_young(struct vm_area_struct *vma, 338int ptep_test_and_clear_young(struct vm_area_struct *vma,
314 unsigned long addr, pte_t *ptep) 339 unsigned long addr, pte_t *ptep)
315{ 340{
@@ -325,6 +350,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
325 return ret; 350 return ret;
326} 351}
327 352
353#ifdef CONFIG_TRANSPARENT_HUGEPAGE
354int pmdp_test_and_clear_young(struct vm_area_struct *vma,
355 unsigned long addr, pmd_t *pmdp)
356{
357 int ret = 0;
358
359 if (pmd_young(*pmdp))
360 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
361 (unsigned long *)pmdp);
362
363 if (ret)
364 pmd_update(vma->vm_mm, addr, pmdp);
365
366 return ret;
367}
368#endif
369
328int ptep_clear_flush_young(struct vm_area_struct *vma, 370int ptep_clear_flush_young(struct vm_area_struct *vma,
329 unsigned long address, pte_t *ptep) 371 unsigned long address, pte_t *ptep)
330{ 372{
@@ -337,6 +379,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
337 return young; 379 return young;
338} 380}
339 381
382#ifdef CONFIG_TRANSPARENT_HUGEPAGE
383int pmdp_clear_flush_young(struct vm_area_struct *vma,
384 unsigned long address, pmd_t *pmdp)
385{
386 int young;
387
388 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
389
390 young = pmdp_test_and_clear_young(vma, address, pmdp);
391 if (young)
392 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
393
394 return young;
395}
396
397void pmdp_splitting_flush(struct vm_area_struct *vma,
398 unsigned long address, pmd_t *pmdp)
399{
400 int set;
401 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
402 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
403 (unsigned long *)pmdp);
404 if (set) {
405 pmd_update(vma->vm_mm, address, pmdp);
406 /* need tlb flush only to serialize against gup-fast */
407 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
408 }
409}
410#endif
411
340/** 412/**
341 * reserve_top_address - reserves a hole in the top of kernel address space 413 * reserve_top_address - reserves a hole in the top of kernel address space
342 * @reserve - size of hole to reserve 414 * @reserve - size of hole to reserve
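The pgtable.c changes do three related things: pgd_ctor() now takes the owning mm and records it on the pgd's struct page via pgd_set_mm(), reusing page->index as a back pointer so that code walking pgd_list can presumably recover the mm through pgd_page_get_mm(); the pgd_lock users drop the IRQ-saving lock variants; and transparent hugepages gain pmd-level counterparts of the pte accessed/dirty helpers. The stand-alone sketch below illustrates only the back-pointer idea, with invented names (struct page_desc, owner) standing in for struct page and its index member.

/* Illustrative sketch of stashing an owner back-pointer in a page descriptor. */
#include <stdio.h>
#include <stdint.h>

struct mm_ctx {				/* stands in for struct mm_struct */
	const char *name;
};

struct page_desc {			/* stands in for struct page */
	uintptr_t owner;		/* stands in for the reused 'index' field */
};

static void pgd_set_owner(struct page_desc *pg, struct mm_ctx *mm)
{
	pg->owner = (uintptr_t)mm;	/* the patch's BUILD_BUG_ON checks the field is wide enough */
}

static struct mm_ctx *pgd_get_owner(struct page_desc *pg)
{
	return (struct mm_ctx *)pg->owner;
}

int main(void)
{
	struct mm_ctx mm = { "init_mm" };
	struct page_desc pgd_page = { 0 };

	pgd_set_owner(&pgd_page, &mm);
	printf("pgd belongs to %s\n", pgd_get_owner(&pgd_page)->name);
	return 0;
}
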
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
41{ 41{
42 if (!cpu_has_nx) { 42 if (!cpu_has_nx) {
43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
44 "missing in CPU or disabled in BIOS!\n"); 44 "missing in CPU!\n");
45 } else { 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) { 47 if (disable_nx) {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
new file mode 100644
index 000000000000..81dbfdeb080d
--- /dev/null
+++ b/arch/x86/mm/srat.c
@@ -0,0 +1,184 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/memblock.h>
20#include <linux/mm.h>
21#include <asm/proto.h>
22#include <asm/numa.h>
23#include <asm/e820.h>
24#include <asm/apic.h>
25#include <asm/uv/uv.h>
26
27int acpi_numa __initdata;
28
29static __init int setup_node(int pxm)
30{
31 return acpi_map_pxm_to_node(pxm);
32}
33
34static __init void bad_srat(void)
35{
36 printk(KERN_ERR "SRAT: SRAT not used.\n");
37 acpi_numa = -1;
38}
39
40static __init inline int srat_disabled(void)
41{
42 return acpi_numa < 0;
43}
44
45/* Callback for SLIT parsing */
46void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
47{
48 int i, j;
49
50 for (i = 0; i < slit->locality_count; i++)
51 for (j = 0; j < slit->locality_count; j++)
52 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
53 slit->entry[slit->locality_count * i + j]);
54}
55
56/* Callback for Proximity Domain -> x2APIC mapping */
57void __init
58acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
59{
60 int pxm, node;
61 int apic_id;
62
63 if (srat_disabled())
64 return;
65 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
66 bad_srat();
67 return;
68 }
69 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
70 return;
71 pxm = pa->proximity_domain;
72 node = setup_node(pxm);
73 if (node < 0) {
74 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
75 bad_srat();
76 return;
77 }
78
79 apic_id = pa->apic_id;
80 if (apic_id >= MAX_LOCAL_APIC) {
81 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
82 return;
83 }
84 set_apicid_to_node(apic_id, node);
85 node_set(node, numa_nodes_parsed);
86 acpi_numa = 1;
87 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
88 pxm, apic_id, node);
89}
90
91/* Callback for Proximity Domain -> LAPIC mapping */
92void __init
93acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
94{
95 int pxm, node;
96 int apic_id;
97
98 if (srat_disabled())
99 return;
100 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
101 bad_srat();
102 return;
103 }
104 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
105 return;
106 pxm = pa->proximity_domain_lo;
107 node = setup_node(pxm);
108 if (node < 0) {
109 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
110 bad_srat();
111 return;
112 }
113
114 if (get_uv_system_type() >= UV_X2APIC)
115 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
116 else
117 apic_id = pa->apic_id;
118
119 if (apic_id >= MAX_LOCAL_APIC) {
120 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
121 return;
122 }
123
124 set_apicid_to_node(apic_id, node);
125 node_set(node, numa_nodes_parsed);
126 acpi_numa = 1;
127 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
128 pxm, apic_id, node);
129}
130
131#ifdef CONFIG_MEMORY_HOTPLUG
132static inline int save_add_info(void) {return 1;}
133#else
134static inline int save_add_info(void) {return 0;}
135#endif
136
137/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
138void __init
139acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
140{
141 u64 start, end;
142 int node, pxm;
143
144 if (srat_disabled())
145 return;
146 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
147 bad_srat();
148 return;
149 }
150 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
151 return;
152
153 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
154 return;
155 start = ma->base_address;
156 end = start + ma->length;
157 pxm = ma->proximity_domain;
158 node = setup_node(pxm);
159 if (node < 0) {
160 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
161 bad_srat();
162 return;
163 }
164
165 if (numa_add_memblk(node, start, end) < 0) {
166 bad_srat();
167 return;
168 }
169
170 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
171 start, end);
172}
173
174void __init acpi_numa_arch_fixup(void) {}
175
176int __init x86_acpi_numa_init(void)
177{
178 int ret;
179
180 ret = acpi_numa_init();
181 if (ret < 0)
182 return ret;
183 return srat_disabled() ? -EINVAL : 0;
184}
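The new srat.c replaces the separate 32-bit and 64-bit parsers with one implementation that maps each proximity domain to a node (acpi_map_pxm_to_node), records CPU affinities with set_apicid_to_node(), hands memory ranges to numa_add_memblk(), and feeds the SLIT matrix into numa_set_distance(). The SLIT is a flattened locality_count x locality_count byte matrix, so the distance from domain i to domain j lives at entry[locality_count * i + j]; the short stand-alone sketch below works that indexing for a hypothetical two-node table.

/* Illustrative sketch of the flattened SLIT distance lookup. */
#include <stdio.h>
#include <stdint.h>

/* Hypothetical 2x2 SLIT: 10 = local distance, 20 = remote distance. */
static const uint8_t slit_entry[] = {
	10, 20,		/* row for locality 0 */
	20, 10,		/* row for locality 1 */
};
static const unsigned int locality_count = 2;

static unsigned int slit_distance(unsigned int i, unsigned int j)
{
	/* same indexing as the acpi_numa_slit_init() loop above */
	return slit_entry[locality_count * i + j];
}

int main(void)
{
	printf("0 -> 0: %u\n", slit_distance(0, 0));	/* 10 (local) */
	printf("0 -> 1: %u\n", slit_distance(0, 1));	/* 20 (remote) */
	return 0;
}
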
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 9324f13492d5..000000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 pxm,
163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
164 "enabled and removable" : "enabled" ) );
165}
166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
175/*
176 * The SRAT table always lists ascending addresses, so can always
177 * assume that the first "start" address that you see is the real
178 * start of the node, and that the current "end" address is after
179 * the previous one.
180 */
181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{
183 /*
184 * Only add present memory as told by the e820.
185 * There is no guarantee from the SRAT that the memory it
186 * enumerates is present at boot time because it represents
187 * *possible* memory hotplug areas the same as normal RAM.
188 */
189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return -1;
193 }
194 if (memory_chunk->nid != nid)
195 return -1;
196
197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn;
199
200 if (node_start_pfn[nid] > memory_chunk->start_pfn)
201 node_start_pfn[nid] = memory_chunk->start_pfn;
202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
207}
208
209int __init get_memcfg_from_srat(void)
210{
211 int i, j, nid;
212
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (num_memory_chunks == 0) {
218 printk(KERN_DEBUG
219 "could not find any ACPI SRAT memory areas.\n");
220 goto out_fail;
221 }
222
223 /* Calculate total number of nodes in system from PXM bitmap and create
224 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
225 * to specify the range of _PXM values.)
226 */
227 /*
228 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
229 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
230 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
231 * approaches MAX_PXM_DOMAINS for i386.
232 */
233 nodes_clear(node_online_map);
234 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
235 if (BMAP_TEST(pxm_bitmap, i)) {
236 int nid = acpi_map_pxm_to_node(i);
237 node_set_online(nid);
238 }
239 }
240 BUG_ON(num_online_nodes() == 0);
241
242 /* set cnode id in memory chunk structure */
243 for (i = 0; i < num_memory_chunks; i++)
244 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
245
246 printk(KERN_DEBUG "pxm bitmap: ");
247 for (i = 0; i < sizeof(pxm_bitmap); i++) {
248 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
249 }
250 printk(KERN_CONT "\n");
251 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
252 num_online_nodes());
253 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
254 num_memory_chunks);
255
256 for (i = 0; i < MAX_APICID; i++)
257 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
258
259 for (j = 0; j < num_memory_chunks; j++){
260 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
261 printk(KERN_DEBUG
262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn));
269 }
270 /* for out of order entries in SRAT */
271 sort_node_map();
272
273 for_each_online_node(nid) {
274 unsigned long start = node_start_pfn[nid];
275 unsigned long end = min(node_end_pfn[nid], max_pfn);
276
277 memory_present(nid, start, end);
278 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
279 }
280 return 1;
281out_fail:
282 printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
283 " table\n");
284 return 0;
285}
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
deleted file mode 100644
index 9c0d0d399c30..000000000000
--- a/arch/x86/mm/srat_64.c
+++ /dev/null
@@ -1,564 +0,0 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
20#include <asm/proto.h>
21#include <asm/numa.h>
22#include <asm/e820.h>
23#include <asm/apic.h>
24#include <asm/uv/uv.h>
25
26int acpi_numa __initdata;
27
28static struct acpi_table_slit *acpi_slit;
29
30static nodemask_t nodes_parsed __initdata;
31static nodemask_t cpu_nodes_parsed __initdata;
32static struct bootnode nodes[MAX_NUMNODES] __initdata;
33static struct bootnode nodes_add[MAX_NUMNODES];
34
35static int num_node_memblks __initdata;
36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
38
39static __init int setup_node(int pxm)
40{
41 return acpi_map_pxm_to_node(pxm);
42}
43
44static __init int conflicting_memblks(unsigned long start, unsigned long end)
45{
46 int i;
47 for (i = 0; i < num_node_memblks; i++) {
48 struct bootnode *nd = &node_memblk_range[i];
49 if (nd->start == nd->end)
50 continue;
51 if (nd->end > start && nd->start < end)
52 return memblk_nodeid[i];
53 if (nd->end == end && nd->start == start)
54 return memblk_nodeid[i];
55 }
56 return -1;
57}
58
59static __init void cutoff_node(int i, unsigned long start, unsigned long end)
60{
61 struct bootnode *nd = &nodes[i];
62
63 if (nd->start < start) {
64 nd->start = start;
65 if (nd->end < nd->start)
66 nd->start = nd->end;
67 }
68 if (nd->end > end) {
69 nd->end = end;
70 if (nd->start > nd->end)
71 nd->start = nd->end;
72 }
73}
74
75static __init void bad_srat(void)
76{
77 int i;
78 printk(KERN_ERR "SRAT: SRAT not used.\n");
79 acpi_numa = -1;
80 for (i = 0; i < MAX_LOCAL_APIC; i++)
81 apicid_to_node[i] = NUMA_NO_NODE;
82 for (i = 0; i < MAX_NUMNODES; i++) {
83 nodes[i].start = nodes[i].end = 0;
84 nodes_add[i].start = nodes_add[i].end = 0;
85 }
86 remove_all_active_ranges();
87}
88
89static __init inline int srat_disabled(void)
90{
91 return numa_off || acpi_numa < 0;
92}
93
94/* Callback for SLIT parsing */
95void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
96{
97 unsigned length;
98 unsigned long phys;
99
100 length = slit->header.length;
101 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
102 PAGE_SIZE);
103
104 if (phys == -1L)
105 panic(" Can not save slit!\n");
106
107 acpi_slit = __va(phys);
108 memcpy(acpi_slit, slit, length);
109 reserve_early(phys, phys + length, "ACPI SLIT");
110}
111
112/* Callback for Proximity Domain -> x2APIC mapping */
113void __init
114acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
115{
116 int pxm, node;
117 int apic_id;
118
119 if (srat_disabled())
120 return;
121 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
122 bad_srat();
123 return;
124 }
125 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
126 return;
127 pxm = pa->proximity_domain;
128 node = setup_node(pxm);
129 if (node < 0) {
130 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
131 bad_srat();
132 return;
133 }
134
135 apic_id = pa->apic_id;
136 apicid_to_node[apic_id] = node;
137 node_set(node, cpu_nodes_parsed);
138 acpi_numa = 1;
139 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
140 pxm, apic_id, node);
141}
142
143/* Callback for Proximity Domain -> LAPIC mapping */
144void __init
145acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
146{
147 int pxm, node;
148 int apic_id;
149
150 if (srat_disabled())
151 return;
152 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
153 bad_srat();
154 return;
155 }
156 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
157 return;
158 pxm = pa->proximity_domain_lo;
159 node = setup_node(pxm);
160 if (node < 0) {
161 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
162 bad_srat();
163 return;
164 }
165
166 if (get_uv_system_type() >= UV_X2APIC)
167 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
168 else
169 apic_id = pa->apic_id;
170 apicid_to_node[apic_id] = node;
171 node_set(node, cpu_nodes_parsed);
172 acpi_numa = 1;
173 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
174 pxm, apic_id, node);
175}
176
177#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
178static inline int save_add_info(void) {return 1;}
179#else
180static inline int save_add_info(void) {return 0;}
181#endif
182/*
183 * Update nodes_add[]
184 * This code supports one contiguous hot add area per node
185 */
186static void __init
187update_nodes_add(int node, unsigned long start, unsigned long end)
188{
189 unsigned long s_pfn = start >> PAGE_SHIFT;
190 unsigned long e_pfn = end >> PAGE_SHIFT;
191 int changed = 0;
192 struct bootnode *nd = &nodes_add[node];
193
194 /* I had some trouble with strange memory hotadd regions breaking
195 the boot. Be very strict here and reject anything unexpected.
196 If you want working memory hotadd write correct SRATs.
197
198 The node size check is a basic sanity check to guard against
199 mistakes */
200 if ((signed long)(end - start) < NODE_MIN_SIZE) {
201 printk(KERN_ERR "SRAT: Hotplug area too small\n");
202 return;
203 }
204
205 /* This check might be a bit too strict, but I'm keeping it for now. */
206 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
207 printk(KERN_ERR
208 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
209 s_pfn, e_pfn);
210 return;
211 }
212
213 /* Looks good */
214
215 if (nd->start == nd->end) {
216 nd->start = start;
217 nd->end = end;
218 changed = 1;
219 } else {
220 if (nd->start == end) {
221 nd->start = start;
222 changed = 1;
223 }
224 if (nd->end == start) {
225 nd->end = end;
226 changed = 1;
227 }
228 if (!changed)
229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
230 }
231
232 if (changed) {
233 node_set(node, cpu_nodes_parsed);
234 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
235 nd->start, nd->end);
236 }
237}
238
239/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
240void __init
241acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
242{
243 struct bootnode *nd, oldnode;
244 unsigned long start, end;
245 int node, pxm;
246 int i;
247
248 if (srat_disabled())
249 return;
250 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
251 bad_srat();
252 return;
253 }
254 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
255 return;
256
257 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
258 return;
259 start = ma->base_address;
260 end = start + ma->length;
261 pxm = ma->proximity_domain;
262 node = setup_node(pxm);
263 if (node < 0) {
264 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
265 bad_srat();
266 return;
267 }
268 i = conflicting_memblks(start, end);
269 if (i == node) {
270 printk(KERN_WARNING
271 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
272 pxm, start, end, nodes[i].start, nodes[i].end);
273 } else if (i >= 0) {
274 printk(KERN_ERR
275 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
276 pxm, start, end, node_to_pxm(i),
277 nodes[i].start, nodes[i].end);
278 bad_srat();
279 return;
280 }
281 nd = &nodes[node];
282 oldnode = *nd;
283 if (!node_test_and_set(node, nodes_parsed)) {
284 nd->start = start;
285 nd->end = end;
286 } else {
287 if (start < nd->start)
288 nd->start = start;
289 if (nd->end < end)
290 nd->end = end;
291 }
292
293 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
294 start, end);
295
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end);
298 /* restore nodes[node] */
299 *nd = oldnode;
300 if ((nd->start | nd->end) == 0)
301 node_clear(node, nodes_parsed);
302 }
303
304 node_memblk_range[num_node_memblks].start = start;
305 node_memblk_range[num_node_memblks].end = end;
306 memblk_nodeid[num_node_memblks] = node;
307 num_node_memblks++;
308}
309
310/* Sanity check to catch more bad SRATs (they are amazingly common).
311 Make sure the PXMs cover all memory. */
312static int __init nodes_cover_memory(const struct bootnode *nodes)
313{
314 int i;
315 unsigned long pxmram, e820ram;
316
317 pxmram = 0;
318 for_each_node_mask(i, nodes_parsed) {
319 unsigned long s = nodes[i].start >> PAGE_SHIFT;
320 unsigned long e = nodes[i].end >> PAGE_SHIFT;
321 pxmram += e - s;
322 pxmram -= __absent_pages_in_range(i, s, e);
323 if ((long)pxmram < 0)
324 pxmram = 0;
325 }
326
327 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
328 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
329 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
330 printk(KERN_ERR
331 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
332 (pxmram << PAGE_SHIFT) >> 20,
333 (e820ram << PAGE_SHIFT) >> 20);
334 return 0;
335 }
336 return 1;
337}
338
339void __init acpi_numa_arch_fixup(void) {}
340
341int __init acpi_get_nodes(struct bootnode *physnodes)
342{
343 int i;
344 int ret = 0;
345
346 for_each_node_mask(i, nodes_parsed) {
347 physnodes[ret].start = nodes[i].start;
348 physnodes[ret].end = nodes[i].end;
349 ret++;
350 }
351 return ret;
352}
353
354/* Use the information discovered above to actually set up the nodes. */
355int __init acpi_scan_nodes(unsigned long start, unsigned long end)
356{
357 int i;
358
359 if (acpi_numa <= 0)
360 return -1;
361
362 /* First clean up the node list */
363 for (i = 0; i < MAX_NUMNODES; i++)
364 cutoff_node(i, start, end);
365
366 /*
367 * Join together blocks on the same node, holes between
368 * which don't overlap with memory on other nodes.
369 */
370 for (i = 0; i < num_node_memblks; ++i) {
371 int j, k;
372
373 for (j = i + 1; j < num_node_memblks; ++j) {
374 unsigned long start, end;
375
376 if (memblk_nodeid[i] != memblk_nodeid[j])
377 continue;
378 start = min(node_memblk_range[i].end,
379 node_memblk_range[j].end);
380 end = max(node_memblk_range[i].start,
381 node_memblk_range[j].start);
382 for (k = 0; k < num_node_memblks; ++k) {
383 if (memblk_nodeid[i] == memblk_nodeid[k])
384 continue;
385 if (start < node_memblk_range[k].end &&
386 end > node_memblk_range[k].start)
387 break;
388 }
389 if (k < num_node_memblks)
390 continue;
391 start = min(node_memblk_range[i].start,
392 node_memblk_range[j].start);
393 end = max(node_memblk_range[i].end,
394 node_memblk_range[j].end);
395 printk(KERN_INFO "SRAT: Node %d "
396 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
397 memblk_nodeid[i],
398 node_memblk_range[i].start,
399 node_memblk_range[i].end,
400 node_memblk_range[j].start,
401 node_memblk_range[j].end,
402 start, end);
403 node_memblk_range[i].start = start;
404 node_memblk_range[i].end = end;
405 k = --num_node_memblks - j;
406 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
407 k * sizeof(*memblk_nodeid));
408 memmove(node_memblk_range + j, node_memblk_range + j+1,
409 k * sizeof(*node_memblk_range));
410 --j;
411 }
412 }
413
414 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
415 memblk_nodeid);
416 if (memnode_shift < 0) {
417 printk(KERN_ERR
418 "SRAT: No NUMA node hash function found. Contact maintainer\n");
419 bad_srat();
420 return -1;
421 }
422
423 for (i = 0; i < num_node_memblks; i++)
424 e820_register_active_regions(memblk_nodeid[i],
425 node_memblk_range[i].start >> PAGE_SHIFT,
426 node_memblk_range[i].end >> PAGE_SHIFT);
427
428 /* for out of order entries in SRAT */
429 sort_node_map();
430 if (!nodes_cover_memory(nodes)) {
431 bad_srat();
432 return -1;
433 }
434
435 /* Account for nodes with cpus and no memory */
436 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
437
438 /* Finally register nodes */
439 for_each_node_mask(i, node_possible_map)
440 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
441 /* Try again in case setup_node_bootmem missed one due
442 to missing bootmem */
443 for_each_node_mask(i, node_possible_map)
444 if (!node_online(i))
445 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
446
447 for (i = 0; i < nr_cpu_ids; i++) {
448 int node = early_cpu_to_node(i);
449
450 if (node == NUMA_NO_NODE)
451 continue;
452 if (!node_online(node))
453 numa_clear_node(i);
454 }
455 numa_init_array();
456 return 0;
457}
458
459#ifdef CONFIG_NUMA_EMU
460static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
461 [0 ... MAX_NUMNODES-1] = PXM_INVAL
462};
463static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
464 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
465};
466static int __init find_node_by_addr(unsigned long addr)
467{
468 int ret = NUMA_NO_NODE;
469 int i;
470
471 for_each_node_mask(i, nodes_parsed) {
472 /*
473 * Find the real node that this emulated node appears on. For
474 * the sake of simplicity, we only use a real node's starting
475 * address to determine which emulated node it appears on.
476 */
477 if (addr >= nodes[i].start && addr < nodes[i].end) {
478 ret = i;
479 break;
480 }
481 }
482 return ret;
483}
484
485/*
486 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
487 * mappings that respect the real ACPI topology but reflect our emulated
488 * environment. For each emulated node, we find which real node it appears on
489 * and create PXM to NID mappings for those fake nodes which mirror that
490 * locality. SLIT will now represent the correct distances between emulated
491 * nodes as a result of the real topology.
492 */
493void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
494{
495 int i, j;
496
497 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
498 "topology.\n");
499 for (i = 0; i < num_nodes; i++) {
500 int nid, pxm;
501
502 nid = find_node_by_addr(fake_nodes[i].start);
503 if (nid == NUMA_NO_NODE)
504 continue;
505 pxm = node_to_pxm(nid);
506 if (pxm == PXM_INVAL)
507 continue;
508 fake_node_to_pxm_map[i] = pxm;
509 /*
510 * For each apicid_to_node mapping that exists for this real
511 * node, it must now point to the fake node ID.
512 */
513 for (j = 0; j < MAX_LOCAL_APIC; j++)
514 if (apicid_to_node[j] == nid &&
515 fake_apicid_to_node[j] == NUMA_NO_NODE)
516 fake_apicid_to_node[j] = i;
517 }
518 for (i = 0; i < num_nodes; i++)
519 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
520 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
521
522 nodes_clear(nodes_parsed);
523 for (i = 0; i < num_nodes; i++)
524 if (fake_nodes[i].start != fake_nodes[i].end)
525 node_set(i, nodes_parsed);
526}
527
528static int null_slit_node_compare(int a, int b)
529{
530 return node_to_pxm(a) == node_to_pxm(b);
531}
532#else
533static int null_slit_node_compare(int a, int b)
534{
535 return a == b;
536}
537#endif /* CONFIG_NUMA_EMU */
538
539int __node_distance(int a, int b)
540{
541 int index;
542
543 if (!acpi_slit)
544 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
545 REMOTE_DISTANCE;
546 index = acpi_slit->locality_count * node_to_pxm(a);
547 return acpi_slit->entry[index + node_to_pxm(b)];
548}
549
550EXPORT_SYMBOL(__node_distance);
551
552#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
553int memory_add_physaddr_to_nid(u64 start)
554{
555 int i, ret = 0;
556
557 for_each_node(i)
558 if (nodes_add[i].start <= start && nodes_add[i].end > start)
559 ret = i;
560
561 return ret;
562}
563EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
564#endif
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
5#include <linux/smp.h> 5#include <linux/smp.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cpu.h>
8 9
9#include <asm/tlbflush.h> 10#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
52 want false sharing in the per cpu data segment. */ 53 want false sharing in the per cpu data segment. */
53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; 54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54 55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57
55/* 58/*
56 * We cannot call mmdrop() because we are in interrupt context, 59 * We cannot call mmdrop() because we are in interrupt context,
57 * instead update mm->cpu_vm_mask. 60 * instead update mm->cpu_vm_mask.
@@ -173,15 +176,11 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173 union smp_flush_state *f; 176 union smp_flush_state *f;
174 177
175 /* Caller has disabled preemption */ 178 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 179 sender = this_cpu_read(tlb_vector_offset);
177 f = &flush_state[sender]; 180 f = &flush_state[sender];
178 181
179 /* 182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
180 * Could avoid this lock when 183 raw_spin_lock(&f->tlbstate_lock);
181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
182 * probably not worth checking this for a cache-hot lock.
183 */
184 raw_spin_lock(&f->tlbstate_lock);
185 184
186 f->flush_mm = mm; 185 f->flush_mm = mm;
187 f->flush_va = va; 186 f->flush_va = va;
@@ -199,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 198
200 f->flush_mm = NULL; 199 f->flush_mm = NULL;
201 f->flush_va = 0; 200 f->flush_va = 0;
202 raw_spin_unlock(&f->tlbstate_lock); 201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
203} 203}
204 204
205void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -208,16 +208,57 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
208 if (is_uv_system()) { 208 if (is_uv_system()) {
209 unsigned int cpu; 209 unsigned int cpu;
210 210
211 cpu = get_cpu(); 211 cpu = smp_processor_id();
212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
213 if (cpumask) 213 if (cpumask)
214 flush_tlb_others_ipi(cpumask, mm, va); 214 flush_tlb_others_ipi(cpumask, mm, va);
215 put_cpu();
216 return; 215 return;
217 } 216 }
218 flush_tlb_others_ipi(cpumask, mm, va); 217 flush_tlb_others_ipi(cpumask, mm, va);
219} 218}
220 219
220static void __cpuinit calculate_tlb_offset(void)
221{
222 int cpu, node, nr_node_vecs, idx = 0;
223 /*
224 * we are changing tlb_vector_offset for each CPU in runtime, but this
225 * will not cause inconsistency, as the write is atomic under X86. we
226 * might see more lock contentions in a short time, but after all CPU's
227 * tlb_vector_offset are changed, everything should go normal
228 *
229 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
230 * waste some vectors.
231 **/
232 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
233 nr_node_vecs = 1;
234 else
235 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
236
237 for_each_online_node(node) {
238 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
239 nr_node_vecs;
240 int cpu_offset = 0;
241 for_each_cpu(cpu, cpumask_of_node(node)) {
242 per_cpu(tlb_vector_offset, cpu) = node_offset +
243 cpu_offset;
244 cpu_offset++;
245 cpu_offset = cpu_offset % nr_node_vecs;
246 }
247 idx++;
248 }
249}
250
251static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
252 unsigned long action, void *hcpu)
253{
254 switch (action & 0xf) {
255 case CPU_ONLINE:
256 case CPU_DEAD:
257 calculate_tlb_offset();
258 }
259 return NOTIFY_OK;
260}
261
221static int __cpuinit init_smp_flush(void) 262static int __cpuinit init_smp_flush(void)
222{ 263{
223 int i; 264 int i;
@@ -225,6 +266,8 @@ static int __cpuinit init_smp_flush(void)
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 266 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 raw_spin_lock_init(&flush_state[i].tlbstate_lock); 267 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 268
269 calculate_tlb_offset();
270 hotcpu_notifier(tlb_cpuhp_notify, 0);
228 return 0; 271 return 0;
229} 272}
230core_initcall(init_smp_flush); 273core_initcall(init_smp_flush);
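The tlb.c hunks spread the invalidate IPI vectors across NUMA nodes instead of deriving the sender slot from smp_processor_id() modulo the vector count: each online node gets NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes vectors (or one vector each when there are more nodes than vectors), CPUs within a node round-robin over their node's share, a hotplug notifier recomputes the layout on CPU_ONLINE/CPU_DEAD, and the per-sender tlbstate_lock is only taken when nr_cpu_ids exceeds the vector count. The stand-alone sketch below reproduces the offset arithmetic for a hypothetical machine with 8 vectors, 2 nodes and 4 CPUs per node.

/* User-space sketch of the per-node TLB vector partitioning; illustrative only. */
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS	8

int main(void)
{
	int nr_online_nodes = 2, cpus_per_node = 4;	/* hypothetical topology */
	int nr_node_vecs, node, cpu, idx = 0;

	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
		nr_node_vecs = 1;
	else
		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

	for (node = 0; node < nr_online_nodes; node++) {
		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;
		int cpu_offset = 0;

		for (cpu = 0; cpu < cpus_per_node; cpu++) {
			/* node 0 CPUs land on vectors 0-3, node 1 CPUs on 4-7 */
			printf("node %d cpu %d -> vector offset %d\n",
			       node, cpu, node_offset + cpu_offset);
			cpu_offset = (cpu_offset + 1) % nr_node_vecs;
		}
		idx++;
	}
	return 0;
}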