author     Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/mm
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/mm')
29 files changed, 2562 insertions, 2621 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a4c768397baa..3d11327c9ab4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,10 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o | |||
23 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o | 23 | obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o |
24 | 24 | ||
25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o | 25 | obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o |
26 | obj-$(CONFIG_K8_NUMA) += k8topology_64.o | 26 | obj-$(CONFIG_AMD_NUMA) += amdtopology.o |
27 | obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o | 27 | obj-$(CONFIG_ACPI_NUMA) += srat.o |
28 | obj-$(CONFIG_NUMA_EMU) += numa_emulation.o | ||
29 | |||
30 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | ||
28 | 31 | ||
29 | obj-$(CONFIG_MEMTEST) += memtest.o | 32 | obj-$(CONFIG_MEMTEST) += memtest.o |
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology.c
index 970ed579d4e4..5247d01329ca 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,8 +1,8 @@ | |||
1 | /* | 1 | /* |
2 | * AMD K8 NUMA support. | 2 | * AMD NUMA support. |
3 | * Discover the memory map and associated nodes. | 3 | * Discover the memory map and associated nodes. |
4 | * | 4 | * |
5 | * This version reads it directly from the K8 northbridge. | 5 | * This version reads it directly from the AMD northbridge. |
6 | * | 6 | * |
7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | 7 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. |
8 | */ | 8 | */ |
@@ -11,6 +11,9 @@ | |||
11 | #include <linux/string.h> | 11 | #include <linux/string.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/memblock.h> | ||
15 | #include <linux/bootmem.h> | ||
16 | |||
14 | #include <asm/io.h> | 17 | #include <asm/io.h> |
15 | #include <linux/pci_ids.h> | 18 | #include <linux/pci_ids.h> |
16 | #include <linux/acpi.h> | 19 | #include <linux/acpi.h> |
@@ -22,10 +25,9 @@ | |||
22 | #include <asm/numa.h> | 25 | #include <asm/numa.h> |
23 | #include <asm/mpspec.h> | 26 | #include <asm/mpspec.h> |
24 | #include <asm/apic.h> | 27 | #include <asm/apic.h> |
25 | #include <asm/k8.h> | 28 | #include <asm/amd_nb.h> |
26 | 29 | ||
27 | static struct bootnode __initdata nodes[8]; | 30 | static unsigned char __initdata nodeids[8]; |
28 | static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; | ||
29 | 31 | ||
30 | static __init int find_northbridge(void) | 32 | static __init int find_northbridge(void) |
31 | { | 33 | { |
@@ -48,14 +50,14 @@ static __init int find_northbridge(void) | |||
48 | return num; | 50 | return num; |
49 | } | 51 | } |
50 | 52 | ||
51 | return -1; | 53 | return -ENOENT; |
52 | } | 54 | } |
53 | 55 | ||
54 | static __init void early_get_boot_cpu_id(void) | 56 | static __init void early_get_boot_cpu_id(void) |
55 | { | 57 | { |
56 | /* | 58 | /* |
57 | * need to get boot_cpu_id so can use that to create apicid_to_node | 59 | * need to get the APIC ID of the BSP so can use that to |
58 | * in k8_scan_nodes() | 60 | * create apicid_to_node in amd_scan_nodes() |
59 | */ | 61 | */ |
60 | #ifdef CONFIG_X86_MPPARSE | 62 | #ifdef CONFIG_X86_MPPARSE |
61 | /* | 63 | /* |
@@ -64,33 +66,20 @@ static __init void early_get_boot_cpu_id(void) | |||
64 | if (smp_found_config) | 66 | if (smp_found_config) |
65 | early_get_smp_config(); | 67 | early_get_smp_config(); |
66 | #endif | 68 | #endif |
67 | early_init_lapic_mapping(); | ||
68 | } | ||
69 | |||
70 | int __init k8_get_nodes(struct bootnode *physnodes) | ||
71 | { | ||
72 | int i; | ||
73 | int ret = 0; | ||
74 | |||
75 | for_each_node_mask(i, nodes_parsed) { | ||
76 | physnodes[ret].start = nodes[i].start; | ||
77 | physnodes[ret].end = nodes[i].end; | ||
78 | ret++; | ||
79 | } | ||
80 | return ret; | ||
81 | } | 69 | } |
82 | 70 | ||
83 | int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) | 71 | int __init amd_numa_init(void) |
84 | { | 72 | { |
85 | unsigned long start = PFN_PHYS(start_pfn); | 73 | u64 start = PFN_PHYS(0); |
86 | unsigned long end = PFN_PHYS(end_pfn); | 74 | u64 end = PFN_PHYS(max_pfn); |
87 | unsigned numnodes; | 75 | unsigned numnodes; |
88 | unsigned long prevbase; | 76 | u64 prevbase; |
89 | int i, nb, found = 0; | 77 | int i, j, nb; |
90 | u32 nodeid, reg; | 78 | u32 nodeid, reg; |
79 | unsigned int bits, cores, apicid_base; | ||
91 | 80 | ||
92 | if (!early_pci_allowed()) | 81 | if (!early_pci_allowed()) |
93 | return -1; | 82 | return -EINVAL; |
94 | 83 | ||
95 | nb = find_northbridge(); | 84 | nb = find_northbridge(); |
96 | if (nb < 0) | 85 | if (nb < 0) |
@@ -101,40 +90,40 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
101 | reg = read_pci_config(0, nb, 0, 0x60); | 90 | reg = read_pci_config(0, nb, 0, 0x60); |
102 | numnodes = ((reg >> 4) & 0xF) + 1; | 91 | numnodes = ((reg >> 4) & 0xF) + 1; |
103 | if (numnodes <= 1) | 92 | if (numnodes <= 1) |
104 | return -1; | 93 | return -ENOENT; |
105 | 94 | ||
106 | pr_info("Number of physical nodes %d\n", numnodes); | 95 | pr_info("Number of physical nodes %d\n", numnodes); |
107 | 96 | ||
108 | prevbase = 0; | 97 | prevbase = 0; |
109 | for (i = 0; i < 8; i++) { | 98 | for (i = 0; i < 8; i++) { |
110 | unsigned long base, limit; | 99 | u64 base, limit; |
111 | 100 | ||
112 | base = read_pci_config(0, nb, 1, 0x40 + i*8); | 101 | base = read_pci_config(0, nb, 1, 0x40 + i*8); |
113 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); | 102 | limit = read_pci_config(0, nb, 1, 0x44 + i*8); |
114 | 103 | ||
115 | nodeid = limit & 7; | 104 | nodeids[i] = nodeid = limit & 7; |
116 | if ((base & 3) == 0) { | 105 | if ((base & 3) == 0) { |
117 | if (i < numnodes) | 106 | if (i < numnodes) |
118 | pr_info("Skipping disabled node %d\n", i); | 107 | pr_info("Skipping disabled node %d\n", i); |
119 | continue; | 108 | continue; |
120 | } | 109 | } |
121 | if (nodeid >= numnodes) { | 110 | if (nodeid >= numnodes) { |
122 | pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, | 111 | pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, |
123 | base, limit); | 112 | base, limit); |
124 | continue; | 113 | continue; |
125 | } | 114 | } |
126 | 115 | ||
127 | if (!limit) { | 116 | if (!limit) { |
128 | pr_info("Skipping node entry %d (base %lx)\n", | 117 | pr_info("Skipping node entry %d (base %Lx)\n", |
129 | i, base); | 118 | i, base); |
130 | continue; | 119 | continue; |
131 | } | 120 | } |
132 | if ((base >> 8) & 3 || (limit >> 8) & 3) { | 121 | if ((base >> 8) & 3 || (limit >> 8) & 3) { |
133 | pr_err("Node %d using interleaving mode %lx/%lx\n", | 122 | pr_err("Node %d using interleaving mode %Lx/%Lx\n", |
134 | nodeid, (base >> 8) & 3, (limit >> 8) & 3); | 123 | nodeid, (base >> 8) & 3, (limit >> 8) & 3); |
135 | return -1; | 124 | return -EINVAL; |
136 | } | 125 | } |
137 | if (node_isset(nodeid, nodes_parsed)) { | 126 | if (node_isset(nodeid, numa_nodes_parsed)) { |
138 | pr_info("Node %d already present, skipping\n", | 127 | pr_info("Node %d already present, skipping\n", |
139 | nodeid); | 128 | nodeid); |
140 | continue; | 129 | continue; |
@@ -162,74 +151,47 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) | |||
162 | continue; | 151 | continue; |
163 | } | 152 | } |
164 | if (limit < base) { | 153 | if (limit < base) { |
165 | pr_err("Node %d bogus settings %lx-%lx.\n", | 154 | pr_err("Node %d bogus settings %Lx-%Lx.\n", |
166 | nodeid, base, limit); | 155 | nodeid, base, limit); |
167 | continue; | 156 | continue; |
168 | } | 157 | } |
169 | 158 | ||
170 | /* Could sort here, but pun for now. Should not happen anyroads. */ | 159 | /* Could sort here, but pun for now. Should not happen anyroads. */ |
171 | if (prevbase > base) { | 160 | if (prevbase > base) { |
172 | pr_err("Node map not sorted %lx,%lx\n", | 161 | pr_err("Node map not sorted %Lx,%Lx\n", |
173 | prevbase, base); | 162 | prevbase, base); |
174 | return -1; | 163 | return -EINVAL; |
175 | } | 164 | } |
176 | 165 | ||
177 | pr_info("Node %d MemBase %016lx Limit %016lx\n", | 166 | pr_info("Node %d MemBase %016Lx Limit %016Lx\n", |
178 | nodeid, base, limit); | 167 | nodeid, base, limit); |
179 | 168 | ||
180 | found++; | ||
181 | |||
182 | nodes[nodeid].start = base; | ||
183 | nodes[nodeid].end = limit; | ||
184 | |||
185 | prevbase = base; | 169 | prevbase = base; |
186 | 170 | numa_add_memblk(nodeid, base, limit); | |
187 | node_set(nodeid, nodes_parsed); | 171 | node_set(nodeid, numa_nodes_parsed); |
188 | } | 172 | } |
189 | 173 | ||
190 | if (!found) | 174 | if (!nodes_weight(numa_nodes_parsed)) |
191 | return -1; | 175 | return -ENOENT; |
192 | return 0; | ||
193 | } | ||
194 | 176 | ||
195 | int __init k8_scan_nodes(void) | 177 | /* |
196 | { | 178 | * We seem to have valid NUMA configuration. Map apicids to nodes |
197 | unsigned int bits; | 179 | * using the coreid bits from early_identify_cpu. |
198 | unsigned int cores; | 180 | */ |
199 | unsigned int apicid_base; | ||
200 | int i; | ||
201 | |||
202 | BUG_ON(nodes_empty(nodes_parsed)); | ||
203 | node_possible_map = nodes_parsed; | ||
204 | memnode_shift = compute_hash_shift(nodes, 8, NULL); | ||
205 | if (memnode_shift < 0) { | ||
206 | pr_err("No NUMA node hash function found. Contact maintainer\n"); | ||
207 | return -1; | ||
208 | } | ||
209 | pr_info("Using node hash shift of %d\n", memnode_shift); | ||
210 | |||
211 | /* use the coreid bits from early_identify_cpu */ | ||
212 | bits = boot_cpu_data.x86_coreid_bits; | 181 | bits = boot_cpu_data.x86_coreid_bits; |
213 | cores = (1<<bits); | 182 | cores = 1 << bits; |
214 | apicid_base = 0; | 183 | apicid_base = 0; |
215 | /* need to get boot_cpu_id early for system with apicid lifting */ | 184 | |
185 | /* get the APIC ID of the BSP early for systems with apicid lifting */ | ||
216 | early_get_boot_cpu_id(); | 186 | early_get_boot_cpu_id(); |
217 | if (boot_cpu_physical_apicid > 0) { | 187 | if (boot_cpu_physical_apicid > 0) { |
218 | pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); | 188 | pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); |
219 | apicid_base = boot_cpu_physical_apicid; | 189 | apicid_base = boot_cpu_physical_apicid; |
220 | } | 190 | } |
221 | 191 | ||
222 | for_each_node_mask(i, node_possible_map) { | 192 | for_each_node_mask(i, numa_nodes_parsed) |
223 | int j; | ||
224 | |||
225 | e820_register_active_regions(i, | ||
226 | nodes[i].start >> PAGE_SHIFT, | ||
227 | nodes[i].end >> PAGE_SHIFT); | ||
228 | for (j = apicid_base; j < cores + apicid_base; j++) | 193 | for (j = apicid_base; j < cores + apicid_base; j++) |
229 | apicid_to_node[(i << bits) + j] = i; | 194 | set_apicid_to_node((i << bits) + j, i); |
230 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
231 | } | ||
232 | 195 | ||
233 | numa_init_array(); | ||
234 | return 0; | 196 | return 0; |
235 | } | 197 | } |
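Editor's note: the hunks above fold the old k8_numa_init()/k8_scan_nodes() pair into a single amd_numa_init() that hands each detected range to the common NUMA code via numa_add_memblk() and numa_nodes_parsed. For readers unfamiliar with the register fields the shown code tests, here is a small stand-alone sketch (not part of the commit) of just those checks: the node count from ((reg >> 4) & 0xF) + 1 of function 0 register 0x60, the node ID in bits 2:0 of each limit register, the enable bits 1:0 of each base register, and the interleave bits 9:8. All register values below are invented for illustration.

```c
#include <stdio.h>
#include <stdint.h>

/* Decode only the fields that the diff above actually inspects. */
static void decode_node(int i, uint32_t base, uint32_t limit, unsigned numnodes)
{
	unsigned nodeid = limit & 7;          /* nodeids[i] = nodeid = limit & 7 */

	if ((base & 3) == 0) {                /* read/write enable bits clear */
		printf("entry %d: disabled\n", i);
		return;
	}
	if (nodeid >= numnodes) {
		printf("entry %d: excess node %u, ignored\n", i, nodeid);
		return;
	}
	if (((base >> 8) & 3) || ((limit >> 8) & 3)) {
		printf("entry %d: interleaving enabled, unsupported\n", i);
		return;
	}
	printf("entry %d: node %u looks usable\n", i, nodeid);
}

int main(void)
{
	uint32_t reg = 0x00000010;            /* pretend value of reg 0x60 */
	unsigned numnodes = ((reg >> 4) & 0xF) + 1;

	printf("Number of physical nodes %u\n", numnodes);
	decode_node(0, 0x00000003, 0x00010000, numnodes); /* enabled, node 0 */
	decode_node(1, 0x00000000, 0x00020001, numnodes); /* disabled entry  */
	return 0;
}
```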
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..2dbf6bf4c7e5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,8 @@ | |||
11 | #include <linux/kprobes.h> /* __kprobes, ... */ | 11 | #include <linux/kprobes.h> /* __kprobes, ... */ |
12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ | 12 | #include <linux/mmiotrace.h> /* kmmio_handler, ... */ |
13 | #include <linux/perf_event.h> /* perf_sw_event */ | 13 | #include <linux/perf_event.h> /* perf_sw_event */ |
14 | #include <linux/hugetlb.h> /* hstate_index_to_shift */ | ||
15 | #include <linux/prefetch.h> /* prefetchw */ | ||
14 | 16 | ||
15 | #include <asm/traps.h> /* dotraplinkage, ... */ | 17 | #include <asm/traps.h> /* dotraplinkage, ... */ |
16 | #include <asm/pgalloc.h> /* pgd_*(), ... */ | 18 | #include <asm/pgalloc.h> /* pgd_*(), ... */ |
@@ -160,15 +162,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) | |||
160 | 162 | ||
161 | static void | 163 | static void |
162 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, | 164 | force_sig_info_fault(int si_signo, int si_code, unsigned long address, |
163 | struct task_struct *tsk) | 165 | struct task_struct *tsk, int fault) |
164 | { | 166 | { |
167 | unsigned lsb = 0; | ||
165 | siginfo_t info; | 168 | siginfo_t info; |
166 | 169 | ||
167 | info.si_signo = si_signo; | 170 | info.si_signo = si_signo; |
168 | info.si_errno = 0; | 171 | info.si_errno = 0; |
169 | info.si_code = si_code; | 172 | info.si_code = si_code; |
170 | info.si_addr = (void __user *)address; | 173 | info.si_addr = (void __user *)address; |
171 | info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; | 174 | if (fault & VM_FAULT_HWPOISON_LARGE) |
175 | lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); | ||
176 | if (fault & VM_FAULT_HWPOISON) | ||
177 | lsb = PAGE_SHIFT; | ||
178 | info.si_addr_lsb = lsb; | ||
172 | 179 | ||
173 | force_sig_info(si_signo, &info, tsk); | 180 | force_sig_info(si_signo, &info, tsk); |
174 | } | 181 | } |
@@ -223,16 +230,24 @@ void vmalloc_sync_all(void) | |||
223 | for (address = VMALLOC_START & PMD_MASK; | 230 | for (address = VMALLOC_START & PMD_MASK; |
224 | address >= TASK_SIZE && address < FIXADDR_TOP; | 231 | address >= TASK_SIZE && address < FIXADDR_TOP; |
225 | address += PMD_SIZE) { | 232 | address += PMD_SIZE) { |
226 | |||
227 | unsigned long flags; | ||
228 | struct page *page; | 233 | struct page *page; |
229 | 234 | ||
230 | spin_lock_irqsave(&pgd_lock, flags); | 235 | spin_lock(&pgd_lock); |
231 | list_for_each_entry(page, &pgd_list, lru) { | 236 | list_for_each_entry(page, &pgd_list, lru) { |
232 | if (!vmalloc_sync_one(page_address(page), address)) | 237 | spinlock_t *pgt_lock; |
238 | pmd_t *ret; | ||
239 | |||
240 | /* the pgt_lock only for Xen */ | ||
241 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | ||
242 | |||
243 | spin_lock(pgt_lock); | ||
244 | ret = vmalloc_sync_one(page_address(page), address); | ||
245 | spin_unlock(pgt_lock); | ||
246 | |||
247 | if (!ret) | ||
233 | break; | 248 | break; |
234 | } | 249 | } |
235 | spin_unlock_irqrestore(&pgd_lock, flags); | 250 | spin_unlock(&pgd_lock); |
236 | } | 251 | } |
237 | } | 252 | } |
238 | 253 | ||
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) | |||
251 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 266 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
252 | return -1; | 267 | return -1; |
253 | 268 | ||
269 | WARN_ON_ONCE(in_nmi()); | ||
270 | |||
254 | /* | 271 | /* |
255 | * Synchronize this task's top level page-table | 272 | * Synchronize this task's top level page-table |
256 | * with the 'reference' page table. | 273 | * with the 'reference' page table. |
@@ -326,29 +343,7 @@ out: | |||
326 | 343 | ||
327 | void vmalloc_sync_all(void) | 344 | void vmalloc_sync_all(void) |
328 | { | 345 | { |
329 | unsigned long address; | 346 | sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); |
330 | |||
331 | for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; | ||
332 | address += PGDIR_SIZE) { | ||
333 | |||
334 | const pgd_t *pgd_ref = pgd_offset_k(address); | ||
335 | unsigned long flags; | ||
336 | struct page *page; | ||
337 | |||
338 | if (pgd_none(*pgd_ref)) | ||
339 | continue; | ||
340 | |||
341 | spin_lock_irqsave(&pgd_lock, flags); | ||
342 | list_for_each_entry(page, &pgd_list, lru) { | ||
343 | pgd_t *pgd; | ||
344 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
345 | if (pgd_none(*pgd)) | ||
346 | set_pgd(pgd, *pgd_ref); | ||
347 | else | ||
348 | BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); | ||
349 | } | ||
350 | spin_unlock_irqrestore(&pgd_lock, flags); | ||
351 | } | ||
352 | } | 347 | } |
353 | 348 | ||
354 | /* | 349 | /* |
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address) | |||
369 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) | 364 | if (!(address >= VMALLOC_START && address < VMALLOC_END)) |
370 | return -1; | 365 | return -1; |
371 | 366 | ||
367 | WARN_ON_ONCE(in_nmi()); | ||
368 | |||
372 | /* | 369 | /* |
373 | * Copy kernel mappings over when needed. This can also | 370 | * Copy kernel mappings over when needed. This can also |
374 | * happen within a race in page table update. In the later | 371 | * happen within a race in page table update. In the later |
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, | |||
731 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); | 728 | tsk->thread.error_code = error_code | (address >= TASK_SIZE); |
732 | tsk->thread.trap_no = 14; | 729 | tsk->thread.trap_no = 14; |
733 | 730 | ||
734 | force_sig_info_fault(SIGSEGV, si_code, address, tsk); | 731 | force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); |
735 | 732 | ||
736 | return; | 733 | return; |
737 | } | 734 | } |
@@ -816,28 +813,51 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, | |||
816 | tsk->thread.trap_no = 14; | 813 | tsk->thread.trap_no = 14; |
817 | 814 | ||
818 | #ifdef CONFIG_MEMORY_FAILURE | 815 | #ifdef CONFIG_MEMORY_FAILURE |
819 | if (fault & VM_FAULT_HWPOISON) { | 816 | if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { |
820 | printk(KERN_ERR | 817 | printk(KERN_ERR |
821 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", | 818 | "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", |
822 | tsk->comm, tsk->pid, address); | 819 | tsk->comm, tsk->pid, address); |
823 | code = BUS_MCEERR_AR; | 820 | code = BUS_MCEERR_AR; |
824 | } | 821 | } |
825 | #endif | 822 | #endif |
826 | force_sig_info_fault(SIGBUS, code, address, tsk); | 823 | force_sig_info_fault(SIGBUS, code, address, tsk, fault); |
827 | } | 824 | } |
828 | 825 | ||
829 | static noinline void | 826 | static noinline int |
830 | mm_fault_error(struct pt_regs *regs, unsigned long error_code, | 827 | mm_fault_error(struct pt_regs *regs, unsigned long error_code, |
831 | unsigned long address, unsigned int fault) | 828 | unsigned long address, unsigned int fault) |
832 | { | 829 | { |
830 | /* | ||
831 | * Pagefault was interrupted by SIGKILL. We have no reason to | ||
832 | * continue pagefault. | ||
833 | */ | ||
834 | if (fatal_signal_pending(current)) { | ||
835 | if (!(fault & VM_FAULT_RETRY)) | ||
836 | up_read(¤t->mm->mmap_sem); | ||
837 | if (!(error_code & PF_USER)) | ||
838 | no_context(regs, error_code, address); | ||
839 | return 1; | ||
840 | } | ||
841 | if (!(fault & VM_FAULT_ERROR)) | ||
842 | return 0; | ||
843 | |||
833 | if (fault & VM_FAULT_OOM) { | 844 | if (fault & VM_FAULT_OOM) { |
845 | /* Kernel mode? Handle exceptions or die: */ | ||
846 | if (!(error_code & PF_USER)) { | ||
847 | up_read(¤t->mm->mmap_sem); | ||
848 | no_context(regs, error_code, address); | ||
849 | return 1; | ||
850 | } | ||
851 | |||
834 | out_of_memory(regs, error_code, address); | 852 | out_of_memory(regs, error_code, address); |
835 | } else { | 853 | } else { |
836 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) | 854 | if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| |
855 | VM_FAULT_HWPOISON_LARGE)) | ||
837 | do_sigbus(regs, error_code, address, fault); | 856 | do_sigbus(regs, error_code, address, fault); |
838 | else | 857 | else |
839 | BUG(); | 858 | BUG(); |
840 | } | 859 | } |
860 | return 1; | ||
841 | } | 861 | } |
842 | 862 | ||
843 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) | 863 | static int spurious_fault_check(unsigned long error_code, pte_t *pte) |
@@ -894,8 +914,14 @@ spurious_fault(unsigned long error_code, unsigned long address) | |||
894 | if (pmd_large(*pmd)) | 914 | if (pmd_large(*pmd)) |
895 | return spurious_fault_check(error_code, (pte_t *) pmd); | 915 | return spurious_fault_check(error_code, (pte_t *) pmd); |
896 | 916 | ||
917 | /* | ||
918 | * Note: don't use pte_present() here, since it returns true | ||
919 | * if the _PAGE_PROTNONE bit is set. However, this aliases the | ||
920 | * _PAGE_GLOBAL bit, which for kernel pages give false positives | ||
921 | * when CONFIG_DEBUG_PAGEALLOC is used. | ||
922 | */ | ||
897 | pte = pte_offset_kernel(pmd, address); | 923 | pte = pte_offset_kernel(pmd, address); |
898 | if (!pte_present(*pte)) | 924 | if (!(pte_flags(*pte) & _PAGE_PRESENT)) |
899 | return 0; | 925 | return 0; |
900 | 926 | ||
901 | ret = spurious_fault_check(error_code, pte); | 927 | ret = spurious_fault_check(error_code, pte); |
@@ -915,9 +941,9 @@ spurious_fault(unsigned long error_code, unsigned long address) | |||
915 | int show_unhandled_signals = 1; | 941 | int show_unhandled_signals = 1; |
916 | 942 | ||
917 | static inline int | 943 | static inline int |
918 | access_error(unsigned long error_code, int write, struct vm_area_struct *vma) | 944 | access_error(unsigned long error_code, struct vm_area_struct *vma) |
919 | { | 945 | { |
920 | if (write) { | 946 | if (error_code & PF_WRITE) { |
921 | /* write, present and write, not present: */ | 947 | /* write, present and write, not present: */ |
922 | if (unlikely(!(vma->vm_flags & VM_WRITE))) | 948 | if (unlikely(!(vma->vm_flags & VM_WRITE))) |
923 | return 1; | 949 | return 1; |
@@ -952,8 +978,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
952 | struct task_struct *tsk; | 978 | struct task_struct *tsk; |
953 | unsigned long address; | 979 | unsigned long address; |
954 | struct mm_struct *mm; | 980 | struct mm_struct *mm; |
955 | int write; | ||
956 | int fault; | 981 | int fault; |
982 | int write = error_code & PF_WRITE; | ||
983 | unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | | ||
984 | (write ? FAULT_FLAG_WRITE : 0); | ||
957 | 985 | ||
958 | tsk = current; | 986 | tsk = current; |
959 | mm = tsk->mm; | 987 | mm = tsk->mm; |
@@ -1064,6 +1092,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1064 | bad_area_nosemaphore(regs, error_code, address); | 1092 | bad_area_nosemaphore(regs, error_code, address); |
1065 | return; | 1093 | return; |
1066 | } | 1094 | } |
1095 | retry: | ||
1067 | down_read(&mm->mmap_sem); | 1096 | down_read(&mm->mmap_sem); |
1068 | } else { | 1097 | } else { |
1069 | /* | 1098 | /* |
@@ -1107,9 +1136,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) | |||
1107 | * we can handle it.. | 1136 | * we can handle it.. |
1108 | */ | 1137 | */ |
1109 | good_area: | 1138 | good_area: |
1110 | write = error_code & PF_WRITE; | 1139 | if (unlikely(access_error(error_code, vma))) { |
1111 | |||
1112 | if (unlikely(access_error(error_code, write, vma))) { | ||
1113 | bad_area_access_error(regs, error_code, address); | 1140 | bad_area_access_error(regs, error_code, address); |
1114 | return; | 1141 | return; |
1115 | } | 1142 | } |
@@ -1119,21 +1146,34 @@ good_area: | |||
1119 | * make sure we exit gracefully rather than endlessly redo | 1146 | * make sure we exit gracefully rather than endlessly redo |
1120 | * the fault: | 1147 | * the fault: |
1121 | */ | 1148 | */ |
1122 | fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); | 1149 | fault = handle_mm_fault(mm, vma, address, flags); |
1123 | 1150 | ||
1124 | if (unlikely(fault & VM_FAULT_ERROR)) { | 1151 | if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { |
1125 | mm_fault_error(regs, error_code, address, fault); | 1152 | if (mm_fault_error(regs, error_code, address, fault)) |
1126 | return; | 1153 | return; |
1127 | } | 1154 | } |
1128 | 1155 | ||
1129 | if (fault & VM_FAULT_MAJOR) { | 1156 | /* |
1130 | tsk->maj_flt++; | 1157 | * Major/minor page fault accounting is only done on the |
1131 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | 1158 | * initial attempt. If we go through a retry, it is extremely |
1132 | regs, address); | 1159 | * likely that the page will be found in page cache at that point. |
1133 | } else { | 1160 | */ |
1134 | tsk->min_flt++; | 1161 | if (flags & FAULT_FLAG_ALLOW_RETRY) { |
1135 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | 1162 | if (fault & VM_FAULT_MAJOR) { |
1136 | regs, address); | 1163 | tsk->maj_flt++; |
1164 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, | ||
1165 | regs, address); | ||
1166 | } else { | ||
1167 | tsk->min_flt++; | ||
1168 | perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, | ||
1169 | regs, address); | ||
1170 | } | ||
1171 | if (fault & VM_FAULT_RETRY) { | ||
1172 | /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk | ||
1173 | * of starvation. */ | ||
1174 | flags &= ~FAULT_FLAG_ALLOW_RETRY; | ||
1175 | goto retry; | ||
1176 | } | ||
1137 | } | 1177 | } |
1138 | 1178 | ||
1139 | check_v8086_mode(regs, address, tsk); | 1179 | check_v8086_mode(regs, address, tsk); |
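Editor's note: the do_page_fault() changes above introduce a retry protocol. The first call into handle_mm_fault() passes FAULT_FLAG_ALLOW_RETRY (plus FAULT_FLAG_KILLABLE), major/minor accounting is done only while that flag is still set, and a VM_FAULT_RETRY result clears the flag before jumping back to the retry label, so at most one retry happens and starvation is avoided. A minimal user-space simulation of that control flow follows; the flag values are arbitrary stand-ins, not the kernel's.

```c
#include <stdio.h>

#define FAULT_FLAG_ALLOW_RETRY 0x1
#define FAULT_FLAG_WRITE       0x2
#define VM_FAULT_RETRY         0x4
#define VM_FAULT_MAJOR         0x8

static int attempts;

/* Stand-in for handle_mm_fault(): pretend the first attempt has to wait
 * on I/O and asks the caller to retry, and the second attempt succeeds. */
static unsigned int fake_handle_mm_fault(unsigned int flags)
{
	attempts++;
	if ((flags & FAULT_FLAG_ALLOW_RETRY) && attempts == 1)
		return VM_FAULT_RETRY;
	return VM_FAULT_MAJOR;
}

int main(void)
{
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_WRITE;
	unsigned int fault;
	int maj_flt = 0;

retry:
	fault = fake_handle_mm_fault(flags);

	/* Accounting only on the pass that still allows retries. */
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR)
			maj_flt++;
		if (fault & VM_FAULT_RETRY) {
			/* Clear the flag to avoid any risk of starvation. */
			flags &= ~FAULT_FLAG_ALLOW_RETRY;
			goto retry;
		}
	}
	printf("attempts=%d maj_flt=%d\n", attempts, maj_flt);
	return 0;
}
```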
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/vmstat.h> | 9 | #include <linux/vmstat.h> |
10 | #include <linux/highmem.h> | 10 | #include <linux/highmem.h> |
11 | #include <linux/swap.h> | ||
11 | 12 | ||
12 | #include <asm/pgtable.h> | 13 | #include <asm/pgtable.h> |
13 | 14 | ||
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | |||
89 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 90 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
90 | page = pte_page(pte); | 91 | page = pte_page(pte); |
91 | get_page(page); | 92 | get_page(page); |
93 | SetPageReferenced(page); | ||
92 | pages[*nr] = page; | 94 | pages[*nr] = page; |
93 | (*nr)++; | 95 | (*nr)++; |
94 | 96 | ||
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr) | |||
103 | VM_BUG_ON(page != compound_head(page)); | 105 | VM_BUG_ON(page != compound_head(page)); |
104 | VM_BUG_ON(page_count(page) == 0); | 106 | VM_BUG_ON(page_count(page) == 0); |
105 | atomic_add(nr, &page->_count); | 107 | atomic_add(nr, &page->_count); |
108 | SetPageReferenced(page); | ||
109 | } | ||
110 | |||
111 | static inline void get_huge_page_tail(struct page *page) | ||
112 | { | ||
113 | /* | ||
114 | * __split_huge_page_refcount() cannot run | ||
115 | * from under us. | ||
116 | */ | ||
117 | VM_BUG_ON(atomic_read(&page->_count) < 0); | ||
118 | atomic_inc(&page->_count); | ||
106 | } | 119 | } |
107 | 120 | ||
108 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | 121 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, |
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | |||
128 | do { | 141 | do { |
129 | VM_BUG_ON(compound_head(page) != head); | 142 | VM_BUG_ON(compound_head(page) != head); |
130 | pages[*nr] = page; | 143 | pages[*nr] = page; |
144 | if (PageTail(page)) | ||
145 | get_huge_page_tail(page); | ||
131 | (*nr)++; | 146 | (*nr)++; |
132 | page++; | 147 | page++; |
133 | refs++; | 148 | refs++; |
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
148 | pmd_t pmd = *pmdp; | 163 | pmd_t pmd = *pmdp; |
149 | 164 | ||
150 | next = pmd_addr_end(addr, end); | 165 | next = pmd_addr_end(addr, end); |
151 | if (pmd_none(pmd)) | 166 | /* |
167 | * The pmd_trans_splitting() check below explains why | ||
168 | * pmdp_splitting_flush has to flush the tlb, to stop | ||
169 | * this gup-fast code from running while we set the | ||
170 | * splitting bit in the pmd. Returning zero will take | ||
171 | * the slow path that will call wait_split_huge_page() | ||
172 | * if the pmd is still in splitting state. gup-fast | ||
173 | * can't because it has irq disabled and | ||
174 | * wait_split_huge_page() would never return as the | ||
175 | * tlb flush IPI wouldn't run. | ||
176 | */ | ||
177 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
152 | return 0; | 178 | return 0; |
153 | if (unlikely(pmd_large(pmd))) { | 179 | if (unlikely(pmd_large(pmd))) { |
154 | if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) | 180 | if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) |
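Editor's note: besides the pmd_trans_splitting() guard, the gup.c hunks add SetPageReferenced() calls and a get_huge_page_tail() helper so that every tail page handed out through the fast path carries its own reference in addition to the compound reference taken on the head page via get_head_page_multiple(). The toy model below illustrates only that counting pattern; the struct and page counts are invented and do not reflect the real struct page layout.

```c
#include <stdio.h>

struct toy_page {
	int count;
	int is_tail;
};

#define HPAGE_PAGES 8   /* pretend a huge page spans 8 base pages */

int main(void)
{
	struct toy_page huge[HPAGE_PAGES] = { { .count = 1 } };
	int refs = 0;
	int i;

	for (i = 1; i < HPAGE_PAGES; i++)
		huge[i].is_tail = 1;

	/* "gup" the middle four base pages of the huge page. */
	for (i = 2; i < 6; i++) {
		if (huge[i].is_tail)
			huge[i].count++;     /* like get_huge_page_tail() */
		refs++;
	}
	huge[0].count += refs;               /* like get_head_page_multiple() */

	printf("head count=%d, page[3] tail count=%d\n",
	       huge[0].count, huge[3].count);
	return 0;
}
```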
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 5e8fa12ef861..b49962662101 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -9,6 +9,7 @@ void *kmap(struct page *page) | |||
9 | return page_address(page); | 9 | return page_address(page); |
10 | return kmap_high(page); | 10 | return kmap_high(page); |
11 | } | 11 | } |
12 | EXPORT_SYMBOL(kmap); | ||
12 | 13 | ||
13 | void kunmap(struct page *page) | 14 | void kunmap(struct page *page) |
14 | { | 15 | { |
@@ -18,6 +19,7 @@ void kunmap(struct page *page) | |||
18 | return; | 19 | return; |
19 | kunmap_high(page); | 20 | kunmap_high(page); |
20 | } | 21 | } |
22 | EXPORT_SYMBOL(kunmap); | ||
21 | 23 | ||
22 | /* | 24 | /* |
23 | * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because | 25 | * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because |
@@ -27,10 +29,10 @@ void kunmap(struct page *page) | |||
27 | * However when holding an atomic kmap it is not legal to sleep, so atomic | 29 | * However when holding an atomic kmap it is not legal to sleep, so atomic |
28 | * kmaps are appropriate for short, tight code paths only. | 30 | * kmaps are appropriate for short, tight code paths only. |
29 | */ | 31 | */ |
30 | void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | 32 | void *kmap_atomic_prot(struct page *page, pgprot_t prot) |
31 | { | 33 | { |
32 | enum fixed_addresses idx; | ||
33 | unsigned long vaddr; | 34 | unsigned long vaddr; |
35 | int idx, type; | ||
34 | 36 | ||
35 | /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ | 37 | /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ |
36 | pagefault_disable(); | 38 | pagefault_disable(); |
@@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | |||
38 | if (!PageHighMem(page)) | 40 | if (!PageHighMem(page)) |
39 | return page_address(page); | 41 | return page_address(page); |
40 | 42 | ||
41 | debug_kmap_atomic(type); | 43 | type = kmap_atomic_idx_push(); |
42 | |||
43 | idx = type + KM_TYPE_NR*smp_processor_id(); | 44 | idx = type + KM_TYPE_NR*smp_processor_id(); |
44 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | 45 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); |
45 | BUG_ON(!pte_none(*(kmap_pte-idx))); | 46 | BUG_ON(!pte_none(*(kmap_pte-idx))); |
@@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) | |||
47 | 48 | ||
48 | return (void *)vaddr; | 49 | return (void *)vaddr; |
49 | } | 50 | } |
51 | EXPORT_SYMBOL(kmap_atomic_prot); | ||
52 | |||
53 | void *__kmap_atomic(struct page *page) | ||
54 | { | ||
55 | return kmap_atomic_prot(page, kmap_prot); | ||
56 | } | ||
57 | EXPORT_SYMBOL(__kmap_atomic); | ||
50 | 58 | ||
51 | void *kmap_atomic(struct page *page, enum km_type type) | 59 | /* |
60 | * This is the same as kmap_atomic() but can map memory that doesn't | ||
61 | * have a struct page associated with it. | ||
62 | */ | ||
63 | void *kmap_atomic_pfn(unsigned long pfn) | ||
52 | { | 64 | { |
53 | return kmap_atomic_prot(page, type, kmap_prot); | 65 | return kmap_atomic_prot_pfn(pfn, kmap_prot); |
54 | } | 66 | } |
67 | EXPORT_SYMBOL_GPL(kmap_atomic_pfn); | ||
55 | 68 | ||
56 | void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) | 69 | void __kunmap_atomic(void *kvaddr) |
57 | { | 70 | { |
58 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | 71 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; |
59 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | 72 | |
60 | 73 | if (vaddr >= __fix_to_virt(FIX_KMAP_END) && | |
61 | /* | 74 | vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { |
62 | * Force other mappings to Oops if they'll try to access this pte | 75 | int idx, type; |
63 | * without first remap it. Keeping stale mappings around is a bad idea | 76 | |
64 | * also, in case the page changes cacheability attributes or becomes | 77 | type = kmap_atomic_idx(); |
65 | * a protected page in a hypervisor. | 78 | idx = type + KM_TYPE_NR * smp_processor_id(); |
66 | */ | 79 | |
67 | if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) | 80 | #ifdef CONFIG_DEBUG_HIGHMEM |
81 | WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); | ||
82 | #endif | ||
83 | /* | ||
84 | * Force other mappings to Oops if they'll try to access this | ||
85 | * pte without first remap it. Keeping stale mappings around | ||
86 | * is a bad idea also, in case the page changes cacheability | ||
87 | * attributes or becomes a protected page in a hypervisor. | ||
88 | */ | ||
68 | kpte_clear_flush(kmap_pte-idx, vaddr); | 89 | kpte_clear_flush(kmap_pte-idx, vaddr); |
69 | else { | 90 | kmap_atomic_idx_pop(); |
91 | } | ||
70 | #ifdef CONFIG_DEBUG_HIGHMEM | 92 | #ifdef CONFIG_DEBUG_HIGHMEM |
93 | else { | ||
71 | BUG_ON(vaddr < PAGE_OFFSET); | 94 | BUG_ON(vaddr < PAGE_OFFSET); |
72 | BUG_ON(vaddr >= (unsigned long)high_memory); | 95 | BUG_ON(vaddr >= (unsigned long)high_memory); |
73 | #endif | ||
74 | } | 96 | } |
97 | #endif | ||
75 | 98 | ||
76 | pagefault_enable(); | 99 | pagefault_enable(); |
77 | } | 100 | } |
78 | 101 | EXPORT_SYMBOL(__kunmap_atomic); | |
79 | /* | ||
80 | * This is the same as kmap_atomic() but can map memory that doesn't | ||
81 | * have a struct page associated with it. | ||
82 | */ | ||
83 | void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) | ||
84 | { | ||
85 | return kmap_atomic_prot_pfn(pfn, type, kmap_prot); | ||
86 | } | ||
87 | EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ | ||
88 | 102 | ||
89 | struct page *kmap_atomic_to_page(void *ptr) | 103 | struct page *kmap_atomic_to_page(void *ptr) |
90 | { | 104 | { |
@@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr) | |||
98 | pte = kmap_pte - (idx - FIX_KMAP_BEGIN); | 112 | pte = kmap_pte - (idx - FIX_KMAP_BEGIN); |
99 | return pte_page(*pte); | 113 | return pte_page(*pte); |
100 | } | 114 | } |
101 | |||
102 | EXPORT_SYMBOL(kmap); | ||
103 | EXPORT_SYMBOL(kunmap); | ||
104 | EXPORT_SYMBOL(kmap_atomic); | ||
105 | EXPORT_SYMBOL(kunmap_atomic_notypecheck); | ||
106 | EXPORT_SYMBOL(kmap_atomic_prot); | ||
107 | EXPORT_SYMBOL(kmap_atomic_to_page); | 115 | EXPORT_SYMBOL(kmap_atomic_to_page); |
108 | 116 | ||
109 | void __init set_highmem_pages_init(void) | 117 | void __init set_highmem_pages_init(void) |
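Editor's note: the highmem_32.c rewrite drops the caller-supplied km_type argument. kmap_atomic_prot() now takes a free slot from a per-CPU index stack (kmap_atomic_idx_push()), __kunmap_atomic() releases the most recently taken slot (kmap_atomic_idx()/kmap_atomic_idx_pop()), and the fixmap index is still computed as type + KM_TYPE_NR * smp_processor_id(). Below is a stand-alone sketch of that stack discipline, with the per-CPU state reduced to a single global for brevity; it is an illustration, not the kernel implementation.

```c
#include <stdio.h>
#include <assert.h>

#define KM_TYPE_NR 20

static int kmap_idx;                    /* depth of nested atomic kmaps */

static int kmap_atomic_idx_push(void)
{
	assert(kmap_idx < KM_TYPE_NR);  /* too many nested kmap_atomic()s */
	return kmap_idx++;
}

static int kmap_atomic_idx(void)
{
	return kmap_idx - 1;
}

static void kmap_atomic_idx_pop(void)
{
	kmap_idx--;
	assert(kmap_idx >= 0);
}

int main(void)
{
	int cpu = 0;

	/* Nested "kmap_atomic" calls take consecutive slots... */
	int t0 = kmap_atomic_idx_push();
	int t1 = kmap_atomic_idx_push();
	printf("slots in use: %d and %d (fixmap idx %d and %d)\n",
	       t0, t1, t0 + KM_TYPE_NR * cpu, t1 + KM_TYPE_NR * cpu);

	/* ...and must be released in reverse order ("kunmap_atomic"). */
	printf("unmapping slot %d\n", kmap_atomic_idx());
	kmap_atomic_idx_pop();
	printf("unmapping slot %d\n", kmap_atomic_idx());
	kmap_atomic_idx_pop();
	return 0;
}
```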
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..f581a18c0d4d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
72 | if (!vma_shareable(vma, addr)) | 72 | if (!vma_shareable(vma, addr)) |
73 | return; | 73 | return; |
74 | 74 | ||
75 | spin_lock(&mapping->i_mmap_lock); | 75 | mutex_lock(&mapping->i_mmap_mutex); |
76 | vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { | 76 | vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { |
77 | if (svma == vma) | 77 | if (svma == vma) |
78 | continue; | 78 | continue; |
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
97 | put_page(virt_to_page(spte)); | 97 | put_page(virt_to_page(spte)); |
98 | spin_unlock(&mm->page_table_lock); | 98 | spin_unlock(&mm->page_table_lock); |
99 | out: | 99 | out: |
100 | spin_unlock(&mapping->i_mmap_lock); | 100 | mutex_unlock(&mapping->i_mmap_mutex); |
101 | } | 101 | } |
102 | 102 | ||
103 | /* | 103 | /* |
@@ -326,7 +326,7 @@ try_again: | |||
326 | if (mm->free_area_cache < len) | 326 | if (mm->free_area_cache < len) |
327 | goto fail; | 327 | goto fail; |
328 | 328 | ||
329 | /* either no address requested or cant fit in requested address hole */ | 329 | /* either no address requested or can't fit in requested address hole */ |
330 | addr = (mm->free_area_cache - len) & huge_page_mask(h); | 330 | addr = (mm->free_area_cache - len) & huge_page_mask(h); |
331 | do { | 331 | do { |
332 | /* | 332 | /* |
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b278535b14aa..30326443ab81 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/initrd.h> | 2 | #include <linux/initrd.h> |
3 | #include <linux/ioport.h> | 3 | #include <linux/ioport.h> |
4 | #include <linux/swap.h> | 4 | #include <linux/swap.h> |
5 | #include <linux/memblock.h> | ||
5 | 6 | ||
6 | #include <asm/cacheflush.h> | 7 | #include <asm/cacheflush.h> |
7 | #include <asm/e820.h> | 8 | #include <asm/e820.h> |
@@ -15,11 +16,9 @@ | |||
15 | #include <asm/tlb.h> | 16 | #include <asm/tlb.h> |
16 | #include <asm/proto.h> | 17 | #include <asm/proto.h> |
17 | 18 | ||
18 | DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); | 19 | unsigned long __initdata pgt_buf_start; |
19 | 20 | unsigned long __meminitdata pgt_buf_end; | |
20 | unsigned long __initdata e820_table_start; | 21 | unsigned long __meminitdata pgt_buf_top; |
21 | unsigned long __meminitdata e820_table_end; | ||
22 | unsigned long __meminitdata e820_table_top; | ||
23 | 22 | ||
24 | int after_bootmem; | 23 | int after_bootmem; |
25 | 24 | ||
@@ -32,7 +31,8 @@ int direct_gbpages | |||
32 | static void __init find_early_table_space(unsigned long end, int use_pse, | 31 | static void __init find_early_table_space(unsigned long end, int use_pse, |
33 | int use_gbpages) | 32 | int use_gbpages) |
34 | { | 33 | { |
35 | unsigned long puds, pmds, ptes, tables, start; | 34 | unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; |
35 | phys_addr_t base; | ||
36 | 36 | ||
37 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; | 37 | puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; |
38 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); | 38 | tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); |
@@ -63,29 +63,25 @@ static void __init find_early_table_space(unsigned long end, int use_pse, | |||
63 | #ifdef CONFIG_X86_32 | 63 | #ifdef CONFIG_X86_32 |
64 | /* for fixmap */ | 64 | /* for fixmap */ |
65 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); | 65 | tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); |
66 | #endif | ||
67 | 66 | ||
68 | /* | 67 | good_end = max_pfn_mapped << PAGE_SHIFT; |
69 | * RED-PEN putting page tables only on node 0 could | ||
70 | * cause a hotspot and fill up ZONE_DMA. The page tables | ||
71 | * need roughly 0.5KB per GB. | ||
72 | */ | ||
73 | #ifdef CONFIG_X86_32 | ||
74 | start = 0x7000; | ||
75 | #else | ||
76 | start = 0x8000; | ||
77 | #endif | 68 | #endif |
78 | e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, | 69 | |
79 | tables, PAGE_SIZE); | 70 | base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); |
80 | if (e820_table_start == -1UL) | 71 | if (base == MEMBLOCK_ERROR) |
81 | panic("Cannot find space for the kernel page tables"); | 72 | panic("Cannot find space for the kernel page tables"); |
82 | 73 | ||
83 | e820_table_start >>= PAGE_SHIFT; | 74 | pgt_buf_start = base >> PAGE_SHIFT; |
84 | e820_table_end = e820_table_start; | 75 | pgt_buf_end = pgt_buf_start; |
85 | e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); | 76 | pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); |
86 | 77 | ||
87 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", | 78 | printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", |
88 | end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); | 79 | end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT); |
80 | } | ||
81 | |||
82 | void __init native_pagetable_reserve(u64 start, u64 end) | ||
83 | { | ||
84 | memblock_x86_reserve_range(start, end, "PGTABLE"); | ||
89 | } | 85 | } |
90 | 86 | ||
91 | struct map_range { | 87 | struct map_range { |
@@ -277,30 +273,26 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, | |||
277 | load_cr3(swapper_pg_dir); | 273 | load_cr3(swapper_pg_dir); |
278 | #endif | 274 | #endif |
279 | 275 | ||
280 | #ifdef CONFIG_X86_64 | ||
281 | if (!after_bootmem && !start) { | ||
282 | pud_t *pud; | ||
283 | pmd_t *pmd; | ||
284 | |||
285 | mmu_cr4_features = read_cr4(); | ||
286 | |||
287 | /* | ||
288 | * _brk_end cannot change anymore, but it and _end may be | ||
289 | * located on different 2M pages. cleanup_highmap(), however, | ||
290 | * can only consider _end when it runs, so destroy any | ||
291 | * mappings beyond _brk_end here. | ||
292 | */ | ||
293 | pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); | ||
294 | pmd = pmd_offset(pud, _brk_end - 1); | ||
295 | while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) | ||
296 | pmd_clear(pmd); | ||
297 | } | ||
298 | #endif | ||
299 | __flush_tlb_all(); | 276 | __flush_tlb_all(); |
300 | 277 | ||
301 | if (!after_bootmem && e820_table_end > e820_table_start) | 278 | /* |
302 | reserve_early(e820_table_start << PAGE_SHIFT, | 279 | * Reserve the kernel pagetable pages we used (pgt_buf_start - |
303 | e820_table_end << PAGE_SHIFT, "PGTABLE"); | 280 | * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) |
281 | * so that they can be reused for other purposes. | ||
282 | * | ||
283 | * On native it just means calling memblock_x86_reserve_range, on Xen it | ||
284 | * also means marking RW the pagetable pages that we allocated before | ||
285 | * but that haven't been used. | ||
286 | * | ||
287 | * In fact on xen we mark RO the whole range pgt_buf_start - | ||
288 | * pgt_buf_top, because we have to make sure that when | ||
289 | * init_memory_mapping reaches the pagetable pages area, it maps | ||
290 | * RO all the pagetable pages, including the ones that are beyond | ||
291 | * pgt_buf_end at that time. | ||
292 | */ | ||
293 | if (!after_bootmem && pgt_buf_end > pgt_buf_start) | ||
294 | x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start), | ||
295 | PFN_PHYS(pgt_buf_end)); | ||
304 | 296 | ||
305 | if (!after_bootmem) | 297 | if (!after_bootmem) |
306 | early_memtest(start, end); | 298 | early_memtest(start, end); |
@@ -362,8 +354,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) | |||
362 | /* | 354 | /* |
363 | * We just marked the kernel text read only above, now that | 355 | * We just marked the kernel text read only above, now that |
364 | * we are going to free part of that, we need to make that | 356 | * we are going to free part of that, we need to make that |
365 | * writeable first. | 357 | * writeable and non-executable first. |
366 | */ | 358 | */ |
359 | set_memory_nx(begin, (end - begin) >> PAGE_SHIFT); | ||
367 | set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); | 360 | set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); |
368 | 361 | ||
369 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); | 362 | printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); |
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..29f7c6d98179 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/pfn.h> | 25 | #include <linux/pfn.h> |
26 | #include <linux/poison.h> | 26 | #include <linux/poison.h> |
27 | #include <linux/bootmem.h> | 27 | #include <linux/bootmem.h> |
28 | #include <linux/memblock.h> | ||
28 | #include <linux/proc_fs.h> | 29 | #include <linux/proc_fs.h> |
29 | #include <linux/memory_hotplug.h> | 30 | #include <linux/memory_hotplug.h> |
30 | #include <linux/initrd.h> | 31 | #include <linux/initrd.h> |
@@ -44,6 +45,7 @@ | |||
44 | #include <asm/bugs.h> | 45 | #include <asm/bugs.h> |
45 | #include <asm/tlb.h> | 46 | #include <asm/tlb.h> |
46 | #include <asm/tlbflush.h> | 47 | #include <asm/tlbflush.h> |
48 | #include <asm/olpc_ofw.h> | ||
47 | #include <asm/pgalloc.h> | 49 | #include <asm/pgalloc.h> |
48 | #include <asm/sections.h> | 50 | #include <asm/sections.h> |
49 | #include <asm/paravirt.h> | 51 | #include <asm/paravirt.h> |
@@ -60,14 +62,14 @@ bool __read_mostly __vmalloc_start_set = false; | |||
60 | 62 | ||
61 | static __init void *alloc_low_page(void) | 63 | static __init void *alloc_low_page(void) |
62 | { | 64 | { |
63 | unsigned long pfn = e820_table_end++; | 65 | unsigned long pfn = pgt_buf_end++; |
64 | void *adr; | 66 | void *adr; |
65 | 67 | ||
66 | if (pfn >= e820_table_top) | 68 | if (pfn >= pgt_buf_top) |
67 | panic("alloc_low_page: ran out of memory"); | 69 | panic("alloc_low_page: ran out of memory"); |
68 | 70 | ||
69 | adr = __va(pfn * PAGE_SIZE); | 71 | adr = __va(pfn * PAGE_SIZE); |
70 | memset(adr, 0, PAGE_SIZE); | 72 | clear_page(adr); |
71 | return adr; | 73 | return adr; |
72 | } | 74 | } |
73 | 75 | ||
@@ -161,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd, | |||
161 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end | 163 | if (pmd_idx_kmap_begin != pmd_idx_kmap_end |
162 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin | 164 | && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin |
163 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end | 165 | && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end |
164 | && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start | 166 | && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start |
165 | || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { | 167 | || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) { |
166 | pte_t *newpte; | 168 | pte_t *newpte; |
167 | int i; | 169 | int i; |
168 | 170 | ||
@@ -225,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) | |||
225 | 227 | ||
226 | static inline int is_kernel_text(unsigned long addr) | 228 | static inline int is_kernel_text(unsigned long addr) |
227 | { | 229 | { |
228 | if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) | 230 | if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end) |
229 | return 1; | 231 | return 1; |
230 | return 0; | 232 | return 0; |
231 | } | 233 | } |
@@ -422,49 +424,28 @@ static void __init add_one_highpage_init(struct page *page) | |||
422 | totalhigh_pages++; | 424 | totalhigh_pages++; |
423 | } | 425 | } |
424 | 426 | ||
425 | struct add_highpages_data { | 427 | void __init add_highpages_with_active_regions(int nid, |
426 | unsigned long start_pfn; | 428 | unsigned long start_pfn, unsigned long end_pfn) |
427 | unsigned long end_pfn; | ||
428 | }; | ||
429 | |||
430 | static int __init add_highpages_work_fn(unsigned long start_pfn, | ||
431 | unsigned long end_pfn, void *datax) | ||
432 | { | 429 | { |
433 | int node_pfn; | 430 | struct range *range; |
434 | struct page *page; | 431 | int nr_range; |
435 | unsigned long final_start_pfn, final_end_pfn; | 432 | int i; |
436 | struct add_highpages_data *data; | ||
437 | 433 | ||
438 | data = (struct add_highpages_data *)datax; | 434 | nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); |
439 | 435 | ||
440 | final_start_pfn = max(start_pfn, data->start_pfn); | 436 | for (i = 0; i < nr_range; i++) { |
441 | final_end_pfn = min(end_pfn, data->end_pfn); | 437 | struct page *page; |
442 | if (final_start_pfn >= final_end_pfn) | 438 | int node_pfn; |
443 | return 0; | ||
444 | 439 | ||
445 | for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; | 440 | for (node_pfn = range[i].start; node_pfn < range[i].end; |
446 | node_pfn++) { | 441 | node_pfn++) { |
447 | if (!pfn_valid(node_pfn)) | 442 | if (!pfn_valid(node_pfn)) |
448 | continue; | 443 | continue; |
449 | page = pfn_to_page(node_pfn); | 444 | page = pfn_to_page(node_pfn); |
450 | add_one_highpage_init(page); | 445 | add_one_highpage_init(page); |
446 | } | ||
451 | } | 447 | } |
452 | |||
453 | return 0; | ||
454 | |||
455 | } | 448 | } |
456 | |||
457 | void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, | ||
458 | unsigned long end_pfn) | ||
459 | { | ||
460 | struct add_highpages_data data; | ||
461 | |||
462 | data.start_pfn = start_pfn; | ||
463 | data.end_pfn = end_pfn; | ||
464 | |||
465 | work_with_active_regions(nid, add_highpages_work_fn, &data); | ||
466 | } | ||
467 | |||
468 | #else | 449 | #else |
469 | static inline void permanent_kmaps_init(pgd_t *pgd_base) | 450 | static inline void permanent_kmaps_init(pgd_t *pgd_base) |
470 | { | 451 | { |
@@ -548,48 +529,6 @@ static void __init pagetable_init(void) | |||
548 | permanent_kmaps_init(pgd_base); | 529 | permanent_kmaps_init(pgd_base); |
549 | } | 530 | } |
550 | 531 | ||
551 | #ifdef CONFIG_ACPI_SLEEP | ||
552 | /* | ||
553 | * ACPI suspend needs this for resume, because things like the intel-agp | ||
554 | * driver might have split up a kernel 4MB mapping. | ||
555 | */ | ||
556 | char swsusp_pg_dir[PAGE_SIZE] | ||
557 | __attribute__ ((aligned(PAGE_SIZE))); | ||
558 | |||
559 | static inline void save_pg_dir(void) | ||
560 | { | ||
561 | memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); | ||
562 | } | ||
563 | #else /* !CONFIG_ACPI_SLEEP */ | ||
564 | static inline void save_pg_dir(void) | ||
565 | { | ||
566 | } | ||
567 | #endif /* !CONFIG_ACPI_SLEEP */ | ||
568 | |||
569 | void zap_low_mappings(bool early) | ||
570 | { | ||
571 | int i; | ||
572 | |||
573 | /* | ||
574 | * Zap initial low-memory mappings. | ||
575 | * | ||
576 | * Note that "pgd_clear()" doesn't do it for | ||
577 | * us, because pgd_clear() is a no-op on i386. | ||
578 | */ | ||
579 | for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { | ||
580 | #ifdef CONFIG_X86_PAE | ||
581 | set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); | ||
582 | #else | ||
583 | set_pgd(swapper_pg_dir+i, __pgd(0)); | ||
584 | #endif | ||
585 | } | ||
586 | |||
587 | if (early) | ||
588 | __flush_tlb(); | ||
589 | else | ||
590 | flush_tlb_all(); | ||
591 | } | ||
592 | |||
593 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); | 532 | pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); |
594 | EXPORT_SYMBOL_GPL(__supported_pte_mask); | 533 | EXPORT_SYMBOL_GPL(__supported_pte_mask); |
595 | 534 | ||
@@ -705,21 +644,20 @@ void __init find_low_pfn_range(void) | |||
705 | } | 644 | } |
706 | 645 | ||
707 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 646 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
708 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 647 | void __init initmem_init(void) |
709 | int acpi, int k8) | ||
710 | { | 648 | { |
711 | #ifdef CONFIG_HIGHMEM | 649 | #ifdef CONFIG_HIGHMEM |
712 | highstart_pfn = highend_pfn = max_pfn; | 650 | highstart_pfn = highend_pfn = max_pfn; |
713 | if (max_pfn > max_low_pfn) | 651 | if (max_pfn > max_low_pfn) |
714 | highstart_pfn = max_low_pfn; | 652 | highstart_pfn = max_low_pfn; |
715 | e820_register_active_regions(0, 0, highend_pfn); | 653 | memblock_x86_register_active_regions(0, 0, highend_pfn); |
716 | sparse_memory_present_with_active_regions(0); | 654 | sparse_memory_present_with_active_regions(0); |
717 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", | 655 | printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", |
718 | pages_to_mb(highend_pfn - highstart_pfn)); | 656 | pages_to_mb(highend_pfn - highstart_pfn)); |
719 | num_physpages = highend_pfn; | 657 | num_physpages = highend_pfn; |
720 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; | 658 | high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; |
721 | #else | 659 | #else |
722 | e820_register_active_regions(0, 0, max_low_pfn); | 660 | memblock_x86_register_active_regions(0, 0, max_low_pfn); |
723 | sparse_memory_present_with_active_regions(0); | 661 | sparse_memory_present_with_active_regions(0); |
724 | num_physpages = max_low_pfn; | 662 | num_physpages = max_low_pfn; |
725 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; | 663 | high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; |
@@ -740,8 +678,10 @@ static void __init zone_sizes_init(void) | |||
740 | { | 678 | { |
741 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | 679 | unsigned long max_zone_pfns[MAX_NR_ZONES]; |
742 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | 680 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); |
681 | #ifdef CONFIG_ZONE_DMA | ||
743 | max_zone_pfns[ZONE_DMA] = | 682 | max_zone_pfns[ZONE_DMA] = |
744 | virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; | 683 | virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; |
684 | #endif | ||
745 | max_zone_pfns[ZONE_NORMAL] = max_low_pfn; | 685 | max_zone_pfns[ZONE_NORMAL] = max_low_pfn; |
746 | #ifdef CONFIG_HIGHMEM | 686 | #ifdef CONFIG_HIGHMEM |
747 | max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; | 687 | max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; |
@@ -750,68 +690,12 @@ static void __init zone_sizes_init(void) | |||
750 | free_area_init_nodes(max_zone_pfns); | 690 | free_area_init_nodes(max_zone_pfns); |
751 | } | 691 | } |
752 | 692 | ||
753 | #ifndef CONFIG_NO_BOOTMEM | ||
754 | static unsigned long __init setup_node_bootmem(int nodeid, | ||
755 | unsigned long start_pfn, | ||
756 | unsigned long end_pfn, | ||
757 | unsigned long bootmap) | ||
758 | { | ||
759 | unsigned long bootmap_size; | ||
760 | |||
761 | /* don't touch min_low_pfn */ | ||
762 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
763 | bootmap >> PAGE_SHIFT, | ||
764 | start_pfn, end_pfn); | ||
765 | printk(KERN_INFO " node %d low ram: %08lx - %08lx\n", | ||
766 | nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); | ||
767 | printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", | ||
768 | nodeid, bootmap, bootmap + bootmap_size); | ||
769 | free_bootmem_with_active_regions(nodeid, end_pfn); | ||
770 | |||
771 | return bootmap + bootmap_size; | ||
772 | } | ||
773 | #endif | ||
774 | |||
775 | void __init setup_bootmem_allocator(void) | 693 | void __init setup_bootmem_allocator(void) |
776 | { | 694 | { |
777 | #ifndef CONFIG_NO_BOOTMEM | ||
778 | int nodeid; | ||
779 | unsigned long bootmap_size, bootmap; | ||
780 | /* | ||
781 | * Initialize the boot-time allocator (with low memory only): | ||
782 | */ | ||
783 | bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT; | ||
784 | bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size, | ||
785 | PAGE_SIZE); | ||
786 | if (bootmap == -1L) | ||
787 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | ||
788 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); | ||
789 | #endif | ||
790 | |||
791 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", | 695 | printk(KERN_INFO " mapped low ram: 0 - %08lx\n", |
792 | max_pfn_mapped<<PAGE_SHIFT); | 696 | max_pfn_mapped<<PAGE_SHIFT); |
793 | printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); | 697 | printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); |
794 | 698 | ||
795 | #ifndef CONFIG_NO_BOOTMEM | ||
796 | for_each_online_node(nodeid) { | ||
797 | unsigned long start_pfn, end_pfn; | ||
798 | |||
799 | #ifdef CONFIG_NEED_MULTIPLE_NODES | ||
800 | start_pfn = node_start_pfn[nodeid]; | ||
801 | end_pfn = node_end_pfn[nodeid]; | ||
802 | if (start_pfn > max_low_pfn) | ||
803 | continue; | ||
804 | if (end_pfn > max_low_pfn) | ||
805 | end_pfn = max_low_pfn; | ||
806 | #else | ||
807 | start_pfn = 0; | ||
808 | end_pfn = max_low_pfn; | ||
809 | #endif | ||
810 | bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, | ||
811 | bootmap); | ||
812 | } | ||
813 | #endif | ||
814 | |||
815 | after_bootmem = 1; | 699 | after_bootmem = 1; |
816 | } | 700 | } |
817 | 701 | ||
@@ -833,6 +717,8 @@ void __init paging_init(void) | |||
833 | /* | 717 | /* |
834 | * NOTE: at this point the bootmem allocator is fully available. | 718 | * NOTE: at this point the bootmem allocator is fully available. |
835 | */ | 719 | */ |
720 | olpc_dt_build_devicetree(); | ||
721 | sparse_memory_present_with_active_regions(MAX_NUMNODES); | ||
836 | sparse_init(); | 722 | sparse_init(); |
837 | zone_sizes_init(); | 723 | zone_sizes_init(); |
838 | } | 724 | } |
@@ -958,9 +844,6 @@ void __init mem_init(void) | |||
958 | 844 | ||
959 | if (boot_cpu_data.wp_works_ok < 0) | 845 | if (boot_cpu_data.wp_works_ok < 0) |
960 | test_wp_bit(); | 846 | test_wp_bit(); |
961 | |||
962 | save_pg_dir(); | ||
963 | zap_low_mappings(true); | ||
964 | } | 847 | } |
965 | 848 | ||
966 | #ifdef CONFIG_MEMORY_HOTPLUG | 849 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -1033,6 +916,23 @@ void set_kernel_text_ro(void) | |||
1033 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 916 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
1034 | } | 917 | } |
1035 | 918 | ||
919 | static void mark_nxdata_nx(void) | ||
920 | { | ||
921 | /* | ||
922 | * When this is called, init has already been executed and released, | ||
923 | * so everything past _etext should be NX. | ||
924 | */ | ||
925 | unsigned long start = PFN_ALIGN(_etext); | ||
926 | /* | ||
927 | * This comes from is_kernel_text upper limit. Also HPAGE where used: | ||
928 | */ | ||
929 | unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start; | ||
930 | |||
931 | if (__supported_pte_mask & _PAGE_NX) | ||
932 | printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10); | ||
933 | set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT); | ||
934 | } | ||
935 | |||
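The size computed above deliberately rounds the end of the NX region up past __init_end to the next huge-page boundary, because the kernel mapping may use 2 MB (or 4 MB) pages there. A stand-alone illustration of that rounding, assuming a 2 MB HPAGE_SIZE and using invented addresses:

        #include <stdio.h>

        int main(void)
        {
                unsigned long hpage_size = 2UL << 20;          /* assumption: 2 MB huge pages */
                unsigned long hpage_mask = ~(hpage_size - 1);
                unsigned long etext      = 0xc1400000UL;       /* invented PFN_ALIGN(_etext)  */
                unsigned long init_end   = 0xc1a34000UL;       /* invented __init_end         */
                unsigned long size = ((init_end + hpage_size) & hpage_mask) - etext;

                /* __init_end rounds up to 0xc1c00000, so 8192k gets NX-protected here */
                printf("NX-protecting the kernel data: %luk\n", size >> 10);
                return 0;
        }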
1036 | void mark_rodata_ro(void) | 936 | void mark_rodata_ro(void) |
1037 | { | 937 | { |
1038 | unsigned long start = PFN_ALIGN(_text); | 938 | unsigned long start = PFN_ALIGN(_text); |
@@ -1067,11 +967,7 @@ void mark_rodata_ro(void) | |||
1067 | printk(KERN_INFO "Testing CPA: write protecting again\n"); | 967 | printk(KERN_INFO "Testing CPA: write protecting again\n"); |
1068 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); | 968 | set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); |
1069 | #endif | 969 | #endif |
970 | mark_nxdata_nx(); | ||
1070 | } | 971 | } |
1071 | #endif | 972 | #endif |
1072 | 973 | ||
1073 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
1074 | int flags) | ||
1075 | { | ||
1076 | return reserve_bootmem(phys, len, flags); | ||
1077 | } | ||
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a6674689a20..bbaaa005bf0e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -21,12 +21,14 @@ | |||
21 | #include <linux/initrd.h> | 21 | #include <linux/initrd.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/memblock.h> | ||
24 | #include <linux/proc_fs.h> | 25 | #include <linux/proc_fs.h> |
25 | #include <linux/pci.h> | 26 | #include <linux/pci.h> |
26 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
27 | #include <linux/poison.h> | 28 | #include <linux/poison.h> |
28 | #include <linux/dma-mapping.h> | 29 | #include <linux/dma-mapping.h> |
29 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/memory.h> | ||
30 | #include <linux/memory_hotplug.h> | 32 | #include <linux/memory_hotplug.h> |
31 | #include <linux/nmi.h> | 33 | #include <linux/nmi.h> |
32 | #include <linux/gfp.h> | 34 | #include <linux/gfp.h> |
@@ -50,9 +52,8 @@ | |||
50 | #include <asm/numa.h> | 52 | #include <asm/numa.h> |
51 | #include <asm/cacheflush.h> | 53 | #include <asm/cacheflush.h> |
52 | #include <asm/init.h> | 54 | #include <asm/init.h> |
53 | #include <linux/bootmem.h> | 55 | #include <asm/uv/uv.h> |
54 | 56 | #include <asm/setup.h> | |
55 | static unsigned long dma_reserve __initdata; | ||
56 | 57 | ||
57 | static int __init parse_direct_gbpages_off(char *arg) | 58 | static int __init parse_direct_gbpages_off(char *arg) |
58 | { | 59 | { |
@@ -98,6 +99,43 @@ static int __init nonx32_setup(char *str) | |||
98 | __setup("noexec32=", nonx32_setup); | 99 | __setup("noexec32=", nonx32_setup); |
99 | 100 | ||
100 | /* | 101 | /* |
102 | * When memory is added or removed, make sure that all processes' mms have | ||
103 | * suitable PGD entries in the local PGD-level page. | ||
104 | */ | ||
105 | void sync_global_pgds(unsigned long start, unsigned long end) | ||
106 | { | ||
107 | unsigned long address; | ||
108 | |||
109 | for (address = start; address <= end; address += PGDIR_SIZE) { | ||
110 | const pgd_t *pgd_ref = pgd_offset_k(address); | ||
111 | struct page *page; | ||
112 | |||
113 | if (pgd_none(*pgd_ref)) | ||
114 | continue; | ||
115 | |||
116 | spin_lock(&pgd_lock); | ||
117 | list_for_each_entry(page, &pgd_list, lru) { | ||
118 | pgd_t *pgd; | ||
119 | spinlock_t *pgt_lock; | ||
120 | |||
121 | pgd = (pgd_t *)page_address(page) + pgd_index(address); | ||
122 | /* the pgt_lock is only needed for Xen */ | ||
123 | pgt_lock = &pgd_page_get_mm(page)->page_table_lock; | ||
124 | spin_lock(pgt_lock); | ||
125 | |||
126 | if (pgd_none(*pgd)) | ||
127 | set_pgd(pgd, *pgd_ref); | ||
128 | else | ||
129 | BUG_ON(pgd_page_vaddr(*pgd) | ||
130 | != pgd_page_vaddr(*pgd_ref)); | ||
131 | |||
132 | spin_unlock(pgt_lock); | ||
133 | } | ||
134 | spin_unlock(&pgd_lock); | ||
135 | } | ||
136 | } | ||
137 | |||
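The comment above gives the rationale; the caller-side pattern this merge introduces (see the kernel_physical_mapping_init() and vmemmap_populate() hunks further down) is to note whether any kernel-space PGD entry was newly populated and, if so, sync once over the affected range. A minimal sketch of that pattern; populate_pud_for() is a hypothetical stand-in for the real population work:

        bool pgd_changed = false;
        unsigned long addr, next;

        for (addr = start; addr < end; addr = next) {
                pgd_t *pgd = pgd_offset_k(addr);

                next = (addr & PGDIR_MASK) + PGDIR_SIZE;
                if (!pgd_val(*pgd)) {
                        populate_pud_for(pgd, addr, min(next, end));  /* hypothetical */
                        pgd_changed = true;
                }
        }
        if (pgd_changed)
                sync_global_pgds(start, end);   /* propagate the new entries to every mm */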
138 | /* | ||
101 | * NOTE: This function is marked __ref because it calls __init function | 139 | * NOTE: This function is marked __ref because it calls __init function |
102 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. | 140 | * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. |
103 | */ | 141 | */ |
@@ -258,18 +296,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) | |||
258 | * to the compile time generated pmds. This results in invalid pmds up | 296 | * to the compile time generated pmds. This results in invalid pmds up |
259 | * to the point where we hit the physaddr 0 mapping. | 297 | * to the point where we hit the physaddr 0 mapping. |
260 | * | 298 | * |
261 | * We limit the mappings to the region from _text to _end. _end is | 299 | * We limit the mappings to the region from _text to _brk_end. _brk_end |
262 | * rounded up to the 2MB boundary. This catches the invalid pmds as | 300 | * is rounded up to the 2MB boundary. This catches the invalid pmds as |
263 | * well, as they are located before _text: | 301 | * well, as they are located before _text: |
264 | */ | 302 | */ |
265 | void __init cleanup_highmap(void) | 303 | void __init cleanup_highmap(void) |
266 | { | 304 | { |
267 | unsigned long vaddr = __START_KERNEL_map; | 305 | unsigned long vaddr = __START_KERNEL_map; |
268 | unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; | 306 | unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); |
307 | unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; | ||
269 | pmd_t *pmd = level2_kernel_pgt; | 308 | pmd_t *pmd = level2_kernel_pgt; |
270 | pmd_t *last_pmd = pmd + PTRS_PER_PMD; | ||
271 | 309 | ||
272 | for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { | 310 | for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { |
273 | if (pmd_none(*pmd)) | 311 | if (pmd_none(*pmd)) |
274 | continue; | 312 | continue; |
275 | if (vaddr < (unsigned long) _text || vaddr > end) | 313 | if (vaddr < (unsigned long) _text || vaddr > end) |
@@ -279,7 +317,7 @@ void __init cleanup_highmap(void) | |||
279 | 317 | ||
280 | static __ref void *alloc_low_page(unsigned long *phys) | 318 | static __ref void *alloc_low_page(unsigned long *phys) |
281 | { | 319 | { |
282 | unsigned long pfn = e820_table_end++; | 320 | unsigned long pfn = pgt_buf_end++; |
283 | void *adr; | 321 | void *adr; |
284 | 322 | ||
285 | if (after_bootmem) { | 323 | if (after_bootmem) { |
@@ -289,21 +327,37 @@ static __ref void *alloc_low_page(unsigned long *phys) | |||
289 | return adr; | 327 | return adr; |
290 | } | 328 | } |
291 | 329 | ||
292 | if (pfn >= e820_table_top) | 330 | if (pfn >= pgt_buf_top) |
293 | panic("alloc_low_page: ran out of memory"); | 331 | panic("alloc_low_page: ran out of memory"); |
294 | 332 | ||
295 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); | 333 | adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); |
296 | memset(adr, 0, PAGE_SIZE); | 334 | clear_page(adr); |
297 | *phys = pfn * PAGE_SIZE; | 335 | *phys = pfn * PAGE_SIZE; |
298 | return adr; | 336 | return adr; |
299 | } | 337 | } |
300 | 338 | ||
339 | static __ref void *map_low_page(void *virt) | ||
340 | { | ||
341 | void *adr; | ||
342 | unsigned long phys, left; | ||
343 | |||
344 | if (after_bootmem) | ||
345 | return virt; | ||
346 | |||
347 | phys = __pa(virt); | ||
348 | left = phys & (PAGE_SIZE - 1); | ||
349 | adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE); | ||
350 | adr = (void *)(((unsigned long)adr) | left); | ||
351 | |||
352 | return adr; | ||
353 | } | ||
354 | |||
301 | static __ref void unmap_low_page(void *adr) | 355 | static __ref void unmap_low_page(void *adr) |
302 | { | 356 | { |
303 | if (after_bootmem) | 357 | if (after_bootmem) |
304 | return; | 358 | return; |
305 | 359 | ||
306 | early_iounmap(adr, PAGE_SIZE); | 360 | early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE); |
307 | } | 361 | } |
308 | 362 | ||
309 | static unsigned long __meminit | 363 | static unsigned long __meminit |
@@ -351,15 +405,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, | |||
351 | } | 405 | } |
352 | 406 | ||
353 | static unsigned long __meminit | 407 | static unsigned long __meminit |
354 | phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, | ||
355 | pgprot_t prot) | ||
356 | { | ||
357 | pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); | ||
358 | |||
359 | return phys_pte_init(pte, address, end, prot); | ||
360 | } | ||
361 | |||
362 | static unsigned long __meminit | ||
363 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | 408 | phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, |
364 | unsigned long page_size_mask, pgprot_t prot) | 409 | unsigned long page_size_mask, pgprot_t prot) |
365 | { | 410 | { |
@@ -385,8 +430,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
385 | if (pmd_val(*pmd)) { | 430 | if (pmd_val(*pmd)) { |
386 | if (!pmd_large(*pmd)) { | 431 | if (!pmd_large(*pmd)) { |
387 | spin_lock(&init_mm.page_table_lock); | 432 | spin_lock(&init_mm.page_table_lock); |
388 | last_map_addr = phys_pte_update(pmd, address, | 433 | pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd)); |
434 | last_map_addr = phys_pte_init(pte, address, | ||
389 | end, prot); | 435 | end, prot); |
436 | unmap_low_page(pte); | ||
390 | spin_unlock(&init_mm.page_table_lock); | 437 | spin_unlock(&init_mm.page_table_lock); |
391 | continue; | 438 | continue; |
392 | } | 439 | } |
@@ -433,18 +480,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, | |||
433 | } | 480 | } |
434 | 481 | ||
435 | static unsigned long __meminit | 482 | static unsigned long __meminit |
436 | phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, | ||
437 | unsigned long page_size_mask, pgprot_t prot) | ||
438 | { | ||
439 | pmd_t *pmd = pmd_offset(pud, 0); | ||
440 | unsigned long last_map_addr; | ||
441 | |||
442 | last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); | ||
443 | __flush_tlb_all(); | ||
444 | return last_map_addr; | ||
445 | } | ||
446 | |||
447 | static unsigned long __meminit | ||
448 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | 483 | phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, |
449 | unsigned long page_size_mask) | 484 | unsigned long page_size_mask) |
450 | { | 485 | { |
@@ -469,8 +504,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
469 | 504 | ||
470 | if (pud_val(*pud)) { | 505 | if (pud_val(*pud)) { |
471 | if (!pud_large(*pud)) { | 506 | if (!pud_large(*pud)) { |
472 | last_map_addr = phys_pmd_update(pud, addr, end, | 507 | pmd = map_low_page(pmd_offset(pud, 0)); |
508 | last_map_addr = phys_pmd_init(pmd, addr, end, | ||
473 | page_size_mask, prot); | 509 | page_size_mask, prot); |
510 | unmap_low_page(pmd); | ||
511 | __flush_tlb_all(); | ||
474 | continue; | 512 | continue; |
475 | } | 513 | } |
476 | /* | 514 | /* |
@@ -518,27 +556,18 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, | |||
518 | return last_map_addr; | 556 | return last_map_addr; |
519 | } | 557 | } |
520 | 558 | ||
521 | static unsigned long __meminit | ||
522 | phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, | ||
523 | unsigned long page_size_mask) | ||
524 | { | ||
525 | pud_t *pud; | ||
526 | |||
527 | pud = (pud_t *)pgd_page_vaddr(*pgd); | ||
528 | |||
529 | return phys_pud_init(pud, addr, end, page_size_mask); | ||
530 | } | ||
531 | |||
532 | unsigned long __meminit | 559 | unsigned long __meminit |
533 | kernel_physical_mapping_init(unsigned long start, | 560 | kernel_physical_mapping_init(unsigned long start, |
534 | unsigned long end, | 561 | unsigned long end, |
535 | unsigned long page_size_mask) | 562 | unsigned long page_size_mask) |
536 | { | 563 | { |
537 | 564 | bool pgd_changed = false; | |
538 | unsigned long next, last_map_addr = end; | 565 | unsigned long next, last_map_addr = end; |
566 | unsigned long addr; | ||
539 | 567 | ||
540 | start = (unsigned long)__va(start); | 568 | start = (unsigned long)__va(start); |
541 | end = (unsigned long)__va(end); | 569 | end = (unsigned long)__va(end); |
570 | addr = start; | ||
542 | 571 | ||
543 | for (; start < end; start = next) { | 572 | for (; start < end; start = next) { |
544 | pgd_t *pgd = pgd_offset_k(start); | 573 | pgd_t *pgd = pgd_offset_k(start); |
@@ -550,8 +579,10 @@ kernel_physical_mapping_init(unsigned long start, | |||
550 | next = end; | 579 | next = end; |
551 | 580 | ||
552 | if (pgd_val(*pgd)) { | 581 | if (pgd_val(*pgd)) { |
553 | last_map_addr = phys_pud_update(pgd, __pa(start), | 582 | pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd)); |
583 | last_map_addr = phys_pud_init(pud, __pa(start), | ||
554 | __pa(end), page_size_mask); | 584 | __pa(end), page_size_mask); |
585 | unmap_low_page(pud); | ||
555 | continue; | 586 | continue; |
556 | } | 587 | } |
557 | 588 | ||
@@ -563,33 +594,21 @@ kernel_physical_mapping_init(unsigned long start, | |||
563 | spin_lock(&init_mm.page_table_lock); | 594 | spin_lock(&init_mm.page_table_lock); |
564 | pgd_populate(&init_mm, pgd, __va(pud_phys)); | 595 | pgd_populate(&init_mm, pgd, __va(pud_phys)); |
565 | spin_unlock(&init_mm.page_table_lock); | 596 | spin_unlock(&init_mm.page_table_lock); |
597 | pgd_changed = true; | ||
566 | } | 598 | } |
599 | |||
600 | if (pgd_changed) | ||
601 | sync_global_pgds(addr, end); | ||
602 | |||
567 | __flush_tlb_all(); | 603 | __flush_tlb_all(); |
568 | 604 | ||
569 | return last_map_addr; | 605 | return last_map_addr; |
570 | } | 606 | } |
571 | 607 | ||
572 | #ifndef CONFIG_NUMA | 608 | #ifndef CONFIG_NUMA |
573 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | 609 | void __init initmem_init(void) |
574 | int acpi, int k8) | 610 | { |
575 | { | 611 | memblock_x86_register_active_regions(0, 0, max_pfn); |
576 | #ifndef CONFIG_NO_BOOTMEM | ||
577 | unsigned long bootmap_size, bootmap; | ||
578 | |||
579 | bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; | ||
580 | bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size, | ||
581 | PAGE_SIZE); | ||
582 | if (bootmap == -1L) | ||
583 | panic("Cannot find bootmem map of size %ld\n", bootmap_size); | ||
584 | reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); | ||
585 | /* don't touch min_low_pfn */ | ||
586 | bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, | ||
587 | 0, end_pfn); | ||
588 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
589 | free_bootmem_with_active_regions(0, end_pfn); | ||
590 | #else | ||
591 | e820_register_active_regions(0, start_pfn, end_pfn); | ||
592 | #endif | ||
593 | } | 612 | } |
594 | #endif | 613 | #endif |
595 | 614 | ||
@@ -598,7 +617,9 @@ void __init paging_init(void) | |||
598 | unsigned long max_zone_pfns[MAX_NR_ZONES]; | 617 | unsigned long max_zone_pfns[MAX_NR_ZONES]; |
599 | 618 | ||
600 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); | 619 | memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); |
620 | #ifdef CONFIG_ZONE_DMA | ||
601 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; | 621 | max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; |
622 | #endif | ||
602 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; | 623 | max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; |
603 | max_zone_pfns[ZONE_NORMAL] = max_pfn; | 624 | max_zone_pfns[ZONE_NORMAL] = max_pfn; |
604 | 625 | ||
@@ -661,14 +682,6 @@ int arch_add_memory(int nid, u64 start, u64 size) | |||
661 | } | 682 | } |
662 | EXPORT_SYMBOL_GPL(arch_add_memory); | 683 | EXPORT_SYMBOL_GPL(arch_add_memory); |
663 | 684 | ||
664 | #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) | ||
665 | int memory_add_physaddr_to_nid(u64 start) | ||
666 | { | ||
667 | return 0; | ||
668 | } | ||
669 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
670 | #endif | ||
671 | |||
672 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 685 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
673 | 686 | ||
674 | static struct kcore_list kcore_vsyscall; | 687 | static struct kcore_list kcore_vsyscall; |
@@ -799,52 +812,6 @@ void mark_rodata_ro(void) | |||
799 | 812 | ||
800 | #endif | 813 | #endif |
801 | 814 | ||
802 | int __init reserve_bootmem_generic(unsigned long phys, unsigned long len, | ||
803 | int flags) | ||
804 | { | ||
805 | #ifdef CONFIG_NUMA | ||
806 | int nid, next_nid; | ||
807 | int ret; | ||
808 | #endif | ||
809 | unsigned long pfn = phys >> PAGE_SHIFT; | ||
810 | |||
811 | if (pfn >= max_pfn) { | ||
812 | /* | ||
813 | * This can happen with kdump kernels when accessing | ||
814 | * firmware tables: | ||
815 | */ | ||
816 | if (pfn < max_pfn_mapped) | ||
817 | return -EFAULT; | ||
818 | |||
819 | printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n", | ||
820 | phys, len); | ||
821 | return -EFAULT; | ||
822 | } | ||
823 | |||
824 | /* Should check here against the e820 map to avoid double free */ | ||
825 | #ifdef CONFIG_NUMA | ||
826 | nid = phys_to_nid(phys); | ||
827 | next_nid = phys_to_nid(phys + len - 1); | ||
828 | if (nid == next_nid) | ||
829 | ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags); | ||
830 | else | ||
831 | ret = reserve_bootmem(phys, len, flags); | ||
832 | |||
833 | if (ret != 0) | ||
834 | return ret; | ||
835 | |||
836 | #else | ||
837 | reserve_bootmem(phys, len, flags); | ||
838 | #endif | ||
839 | |||
840 | if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { | ||
841 | dma_reserve += len / PAGE_SIZE; | ||
842 | set_dma_reserve(dma_reserve); | ||
843 | } | ||
844 | |||
845 | return 0; | ||
846 | } | ||
847 | |||
848 | int kern_addr_valid(unsigned long addr) | 815 | int kern_addr_valid(unsigned long addr) |
849 | { | 816 | { |
850 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; | 817 | unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; |
@@ -890,18 +857,18 @@ static struct vm_area_struct gate_vma = { | |||
890 | .vm_flags = VM_READ | VM_EXEC | 857 | .vm_flags = VM_READ | VM_EXEC |
891 | }; | 858 | }; |
892 | 859 | ||
893 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | 860 | struct vm_area_struct *get_gate_vma(struct mm_struct *mm) |
894 | { | 861 | { |
895 | #ifdef CONFIG_IA32_EMULATION | 862 | #ifdef CONFIG_IA32_EMULATION |
896 | if (test_tsk_thread_flag(tsk, TIF_IA32)) | 863 | if (!mm || mm->context.ia32_compat) |
897 | return NULL; | 864 | return NULL; |
898 | #endif | 865 | #endif |
899 | return &gate_vma; | 866 | return &gate_vma; |
900 | } | 867 | } |
901 | 868 | ||
902 | int in_gate_area(struct task_struct *task, unsigned long addr) | 869 | int in_gate_area(struct mm_struct *mm, unsigned long addr) |
903 | { | 870 | { |
904 | struct vm_area_struct *vma = get_gate_vma(task); | 871 | struct vm_area_struct *vma = get_gate_vma(mm); |
905 | 872 | ||
906 | if (!vma) | 873 | if (!vma) |
907 | return 0; | 874 | return 0; |
@@ -910,11 +877,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr) | |||
910 | } | 877 | } |
911 | 878 | ||
912 | /* | 879 | /* |
913 | * Use this when you have no reliable task/vma, typically from interrupt | 880 | * Use this when you have no reliable mm, typically from interrupt |
914 | * context. It is less reliable than using the task's vma and may give | 881 | * context. It is less reliable than using a task's mm and may give |
915 | * false positives: | 882 | * false positives. |
916 | */ | 883 | */ |
917 | int in_gate_area_no_task(unsigned long addr) | 884 | int in_gate_area_no_mm(unsigned long addr) |
918 | { | 885 | { |
919 | return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); | 886 | return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); |
920 | } | 887 | } |
@@ -928,6 +895,17 @@ const char *arch_vma_name(struct vm_area_struct *vma) | |||
928 | return NULL; | 895 | return NULL; |
929 | } | 896 | } |
930 | 897 | ||
898 | #ifdef CONFIG_X86_UV | ||
899 | unsigned long memory_block_size_bytes(void) | ||
900 | { | ||
901 | if (is_uv_system()) { | ||
902 | printk(KERN_INFO "UV: memory block size 2GB\n"); | ||
903 | return 2UL * 1024 * 1024 * 1024; | ||
904 | } | ||
905 | return MIN_MEMORY_BLOCK_SIZE; | ||
906 | } | ||
907 | #endif | ||
908 | |||
931 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | 909 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
932 | /* | 910 | /* |
933 | * Initialise the sparsemem vmemmap using huge-pages at the PMD level. | 911 | * Initialise the sparsemem vmemmap using huge-pages at the PMD level. |
@@ -1003,6 +981,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) | |||
1003 | } | 981 | } |
1004 | 982 | ||
1005 | } | 983 | } |
984 | sync_global_pgds((unsigned long)start_page, end); | ||
1006 | return 0; | 985 | return 0; |
1007 | } | 986 | } |
1008 | 987 | ||
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 72fc70cf6184..7b179b499fa3 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c | |||
@@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) | |||
48 | } | 48 | } |
49 | EXPORT_SYMBOL_GPL(iomap_create_wc); | 49 | EXPORT_SYMBOL_GPL(iomap_create_wc); |
50 | 50 | ||
51 | void | 51 | void iomap_free(resource_size_t base, unsigned long size) |
52 | iomap_free(resource_size_t base, unsigned long size) | ||
53 | { | 52 | { |
54 | io_free_memtype(base, base + size); | 53 | io_free_memtype(base, base + size); |
55 | } | 54 | } |
56 | EXPORT_SYMBOL_GPL(iomap_free); | 55 | EXPORT_SYMBOL_GPL(iomap_free); |
57 | 56 | ||
58 | void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | 57 | void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) |
59 | { | 58 | { |
60 | enum fixed_addresses idx; | ||
61 | unsigned long vaddr; | 59 | unsigned long vaddr; |
60 | int idx, type; | ||
62 | 61 | ||
63 | pagefault_disable(); | 62 | pagefault_disable(); |
64 | 63 | ||
65 | debug_kmap_atomic(type); | 64 | type = kmap_atomic_idx_push(); |
66 | idx = type + KM_TYPE_NR * smp_processor_id(); | 65 | idx = type + KM_TYPE_NR * smp_processor_id(); |
67 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | 66 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); |
68 | set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); | 67 | set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); |
@@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | |||
72 | } | 71 | } |
73 | 72 | ||
74 | /* | 73 | /* |
75 | * Map 'pfn' using fixed map 'type' and protections 'prot' | 74 | * Map 'pfn' using protections 'prot' |
76 | */ | 75 | */ |
77 | void __iomem * | 76 | void __iomem * |
78 | iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | 77 | iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) |
79 | { | 78 | { |
80 | /* | 79 | /* |
81 | * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. | 80 | * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. |
@@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) | |||
86 | if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) | 85 | if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) |
87 | prot = PAGE_KERNEL_UC_MINUS; | 86 | prot = PAGE_KERNEL_UC_MINUS; |
88 | 87 | ||
89 | return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); | 88 | return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); |
90 | } | 89 | } |
91 | EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); | 90 | EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); |
92 | 91 | ||
93 | void | 92 | void |
94 | iounmap_atomic(void __iomem *kvaddr, enum km_type type) | 93 | iounmap_atomic(void __iomem *kvaddr) |
95 | { | 94 | { |
96 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; | 95 | unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; |
97 | enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); | ||
98 | 96 | ||
99 | /* | 97 | if (vaddr >= __fix_to_virt(FIX_KMAP_END) && |
100 | * Force other mappings to Oops if they'll try to access this pte | 98 | vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { |
101 | * without first remap it. Keeping stale mappings around is a bad idea | 99 | int idx, type; |
102 | * also, in case the page changes cacheability attributes or becomes | 100 | |
103 | * a protected page in a hypervisor. | 101 | type = kmap_atomic_idx(); |
104 | */ | 102 | idx = type + KM_TYPE_NR * smp_processor_id(); |
105 | if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) | 103 | |
104 | #ifdef CONFIG_DEBUG_HIGHMEM | ||
105 | WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); | ||
106 | #endif | ||
107 | /* | ||
108 | * Force other mappings to Oops if they'll try to access this | ||
109 | * pte without first remapping it. Keeping stale mappings around | ||
110 | * is also a bad idea, in case the page changes cacheability | ||
111 | * attributes or becomes a protected page in a hypervisor. | ||
112 | */ | ||
106 | kpte_clear_flush(kmap_pte-idx, vaddr); | 113 | kpte_clear_flush(kmap_pte-idx, vaddr); |
114 | kmap_atomic_idx_pop(); | ||
115 | } | ||
107 | 116 | ||
108 | pagefault_enable(); | 117 | pagefault_enable(); |
109 | } | 118 | } |
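With the km_type argument gone from kmap_atomic_prot_pfn(), iomap_atomic_prot_pfn() and iounmap_atomic() in this hunk, the fixmap slot is now chosen internally via the per-CPU index stack (kmap_atomic_idx_push()/kmap_atomic_idx_pop()), so callers just pair the map and unmap calls. A hedged usage sketch; pfn, reg_offset and val are invented, and writel() is assumed to be a suitable accessor for the mapped page:

        void __iomem *va;

        va = iomap_atomic_prot_pfn(pfn, PAGE_KERNEL_WC);   /* slot picked internally now  */
        writel(val, va + reg_offset);                      /* hypothetical MMIO write     */
        iounmap_atomic(va);                                /* pops the per-CPU kmap index */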
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3ba6e0608c55..be1ef574ce9a 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c | |||
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
91 | return (__force void __iomem *)phys_to_virt(phys_addr); | 91 | return (__force void __iomem *)phys_to_virt(phys_addr); |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * Check if the request spans more than any BAR in the iomem resource | ||
95 | * tree. | ||
96 | */ | ||
97 | WARN_ONCE(iomem_map_sanity_check(phys_addr, size), | ||
98 | KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); | ||
99 | |||
100 | /* | ||
101 | * Don't allow anybody to remap normal RAM that we're using.. | 94 | * Don't allow anybody to remap normal RAM that we're using.. |
102 | */ | 95 | */ |
103 | last_pfn = last_addr >> PAGE_SHIFT; | 96 | last_pfn = last_addr >> PAGE_SHIFT; |
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, | |||
170 | ret_addr = (void __iomem *) (vaddr + offset); | 163 | ret_addr = (void __iomem *) (vaddr + offset); |
171 | mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); | 164 | mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); |
172 | 165 | ||
166 | /* | ||
167 | * Check if the request spans more than any BAR in the iomem resource | ||
168 | * tree. | ||
169 | */ | ||
170 | WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), | ||
171 | KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); | ||
172 | |||
173 | return ret_addr; | 173 | return ret_addr; |
174 | err_free_area: | 174 | err_free_area: |
175 | free_vm_area(area); | 175 | free_vm_area(area); |
@@ -362,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr) | |||
362 | return &bm_pte[pte_index(addr)]; | 362 | return &bm_pte[pte_index(addr)]; |
363 | } | 363 | } |
364 | 364 | ||
365 | bool __init is_early_ioremap_ptep(pte_t *ptep) | ||
366 | { | ||
367 | return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)]; | ||
368 | } | ||
369 | |||
365 | static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; | 370 | static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; |
366 | 371 | ||
367 | void __init early_ioremap_init(void) | 372 | void __init early_ioremap_init(void) |
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index af3b6c8a436f..704a37cedddb 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c | |||
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, | |||
185 | e->trace.entries = e->trace_entries; | 185 | e->trace.entries = e->trace_entries; |
186 | e->trace.max_entries = ARRAY_SIZE(e->trace_entries); | 186 | e->trace.max_entries = ARRAY_SIZE(e->trace_entries); |
187 | e->trace.skip = 0; | 187 | e->trace.skip = 0; |
188 | save_stack_trace_bp(&e->trace, regs->bp); | 188 | save_stack_trace_regs(&e->trace, regs); |
189 | 189 | ||
190 | /* Round address down to nearest 16 bytes */ | 190 | /* Round address down to nearest 16 bytes */ |
191 | shadow_copy = kmemcheck_shadow_lookup(address | 191 | shadow_copy = kmemcheck_shadow_lookup(address |
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c index b3b531a4f8e5..d87dd6d042d6 100644 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ b/arch/x86/mm/kmemcheck/kmemcheck.c | |||
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, | |||
631 | if (!pte) | 631 | if (!pte) |
632 | return false; | 632 | return false; |
633 | 633 | ||
634 | WARN_ON_ONCE(in_nmi()); | ||
635 | |||
634 | if (error_code & 2) | 636 | if (error_code & 2) |
635 | kmemcheck_access(regs, address, KMEMCHECK_WRITE); | 637 | kmemcheck_access(regs, address, KMEMCHECK_WRITE); |
636 | else | 638 | else |
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index 63c19e27aa6f..324aa3f07237 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c | |||
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b) | |||
9 | b == 0xf0 || b == 0xf2 || b == 0xf3 | 9 | b == 0xf0 || b == 0xf2 || b == 0xf3 |
10 | /* Group 2 */ | 10 | /* Group 2 */ |
11 | || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 | 11 | || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 |
12 | || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e | 12 | || b == 0x64 || b == 0x65 |
13 | /* Group 3 */ | 13 | /* Group 3 */ |
14 | || b == 0x66 | 14 | || b == 0x66 |
15 | /* Group 4 */ | 15 | /* Group 4 */ |
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c new file mode 100644 index 000000000000..992da5ec5a64 --- /dev/null +++ b/arch/x86/mm/memblock.c | |||
@@ -0,0 +1,348 @@ | |||
1 | #include <linux/kernel.h> | ||
2 | #include <linux/types.h> | ||
3 | #include <linux/init.h> | ||
4 | #include <linux/bitops.h> | ||
5 | #include <linux/memblock.h> | ||
6 | #include <linux/bootmem.h> | ||
7 | #include <linux/mm.h> | ||
8 | #include <linux/range.h> | ||
9 | |||
10 | /* Check for already reserved areas */ | ||
11 | bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align) | ||
12 | { | ||
13 | struct memblock_region *r; | ||
14 | u64 addr = *addrp, last; | ||
15 | u64 size = *sizep; | ||
16 | bool changed = false; | ||
17 | |||
18 | again: | ||
19 | last = addr + size; | ||
20 | for_each_memblock(reserved, r) { | ||
21 | if (last > r->base && addr < r->base) { | ||
22 | size = r->base - addr; | ||
23 | changed = true; | ||
24 | goto again; | ||
25 | } | ||
26 | if (last > (r->base + r->size) && addr < (r->base + r->size)) { | ||
27 | addr = round_up(r->base + r->size, align); | ||
28 | size = last - addr; | ||
29 | changed = true; | ||
30 | goto again; | ||
31 | } | ||
32 | if (last <= (r->base + r->size) && addr >= r->base) { | ||
33 | *sizep = 0; | ||
34 | return false; | ||
35 | } | ||
36 | } | ||
37 | if (changed) { | ||
38 | *addrp = addr; | ||
39 | *sizep = size; | ||
40 | } | ||
41 | return changed; | ||
42 | } | ||
43 | |||
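To make the clipping above concrete, here is a stand-alone model of the head-overlap case (the candidate range starts before a reserved region and runs into it); all addresses are invented, and the real helper additionally handles the tail-overlap and fully-covered cases and loops until the range is stable:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                /* reserved region [0x2000, 0x3000), candidate [0x1000, 0x5000) */
                uint64_t r_base = 0x2000;
                uint64_t addr = 0x1000, size = 0x4000;
                uint64_t last = addr + size;

                if (last > r_base && addr < r_base)     /* candidate runs into the reservation */
                        size = r_base - addr;           /* shrink to [0x1000, 0x2000)          */

                printf("clipped candidate: [%#llx, %#llx)\n",
                       (unsigned long long)addr, (unsigned long long)(addr + size));
                return 0;
        }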
44 | /* | ||
45 | * Find the next free range after start; its size is returned in *sizep | ||
46 | */ | ||
47 | u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) | ||
48 | { | ||
49 | struct memblock_region *r; | ||
50 | |||
51 | for_each_memblock(memory, r) { | ||
52 | u64 ei_start = r->base; | ||
53 | u64 ei_last = ei_start + r->size; | ||
54 | u64 addr; | ||
55 | |||
56 | addr = round_up(ei_start, align); | ||
57 | if (addr < start) | ||
58 | addr = round_up(start, align); | ||
59 | if (addr >= ei_last) | ||
60 | continue; | ||
61 | *sizep = ei_last - addr; | ||
62 | while (memblock_x86_check_reserved_size(&addr, sizep, align)) | ||
63 | ; | ||
64 | |||
65 | if (*sizep) | ||
66 | return addr; | ||
67 | } | ||
68 | |||
69 | return MEMBLOCK_ERROR; | ||
70 | } | ||
71 | |||
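The intended calling pattern shows up later in this same merge, in the memtest.c hunk: repeatedly ask for the next free chunk and its size until the returned start reaches the end of the window. A condensed sketch of that loop; consume_chunk() is a hypothetical placeholder for whatever the caller does with each free range:

        u64 start = range_start, end = range_end;   /* window to walk (caller-defined) */
        u64 size;

        while (start < end) {
                start = memblock_x86_find_in_range_size(start, &size, 1);
                if (start >= end)                   /* nothing free left in the window */
                        break;
                if (start + size > end)
                        size = end - start;
                consume_chunk(start, size);         /* hypothetical */
                start += size;
        }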
72 | static __init struct range *find_range_array(int count) | ||
73 | { | ||
74 | u64 end, size, mem; | ||
75 | struct range *range; | ||
76 | |||
77 | size = sizeof(struct range) * count; | ||
78 | end = memblock.current_limit; | ||
79 | |||
80 | mem = memblock_find_in_range(0, end, size, sizeof(struct range)); | ||
81 | if (mem == MEMBLOCK_ERROR) | ||
82 | panic("can not find more space for range array"); | ||
83 | |||
84 | /* | ||
85 | * This range is temporary, so don't reserve it; it will not be | ||
86 | * overlapped because we will not allocate a new buffer before | ||
87 | * we discard this one. | ||
88 | */ | ||
89 | range = __va(mem); | ||
90 | memset(range, 0, size); | ||
91 | |||
92 | return range; | ||
93 | } | ||
94 | |||
95 | static void __init memblock_x86_subtract_reserved(struct range *range, int az) | ||
96 | { | ||
97 | u64 final_start, final_end; | ||
98 | struct memblock_region *r; | ||
99 | |||
100 | /* Take out the region array itself first */ | ||
101 | memblock_free_reserved_regions(); | ||
102 | |||
103 | memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt); | ||
104 | |||
105 | for_each_memblock(reserved, r) { | ||
106 | memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); | ||
107 | final_start = PFN_DOWN(r->base); | ||
108 | final_end = PFN_UP(r->base + r->size); | ||
109 | if (final_start >= final_end) | ||
110 | continue; | ||
111 | subtract_range(range, az, final_start, final_end); | ||
112 | } | ||
113 | |||
114 | /* Put region array back ? */ | ||
115 | memblock_reserve_reserved_regions(); | ||
116 | } | ||
117 | |||
118 | struct count_data { | ||
119 | int nr; | ||
120 | }; | ||
121 | |||
122 | static int __init count_work_fn(unsigned long start_pfn, | ||
123 | unsigned long end_pfn, void *datax) | ||
124 | { | ||
125 | struct count_data *data = datax; | ||
126 | |||
127 | data->nr++; | ||
128 | |||
129 | return 0; | ||
130 | } | ||
131 | |||
132 | static int __init count_early_node_map(int nodeid) | ||
133 | { | ||
134 | struct count_data data; | ||
135 | |||
136 | data.nr = 0; | ||
137 | work_with_active_regions(nodeid, count_work_fn, &data); | ||
138 | |||
139 | return data.nr; | ||
140 | } | ||
141 | |||
142 | int __init __get_free_all_memory_range(struct range **rangep, int nodeid, | ||
143 | unsigned long start_pfn, unsigned long end_pfn) | ||
144 | { | ||
145 | int count; | ||
146 | struct range *range; | ||
147 | int nr_range; | ||
148 | |||
149 | count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2; | ||
150 | |||
151 | range = find_range_array(count); | ||
152 | nr_range = 0; | ||
153 | |||
154 | /* | ||
155 | * Use early_node_map[] and memblock.reserved.region to build the range | ||
156 | * array first | ||
157 | */ | ||
158 | nr_range = add_from_early_node_map(range, count, nr_range, nodeid); | ||
159 | subtract_range(range, count, 0, start_pfn); | ||
160 | subtract_range(range, count, end_pfn, -1ULL); | ||
161 | |||
162 | memblock_x86_subtract_reserved(range, count); | ||
163 | nr_range = clean_sort_range(range, count); | ||
164 | |||
165 | *rangep = range; | ||
166 | return nr_range; | ||
167 | } | ||
168 | |||
169 | int __init get_free_all_memory_range(struct range **rangep, int nodeid) | ||
170 | { | ||
171 | unsigned long end_pfn = -1UL; | ||
172 | |||
173 | #ifdef CONFIG_X86_32 | ||
174 | end_pfn = max_low_pfn; | ||
175 | #endif | ||
176 | return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn); | ||
177 | } | ||
178 | |||
179 | static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) | ||
180 | { | ||
181 | int i, count; | ||
182 | struct range *range; | ||
183 | int nr_range; | ||
184 | u64 final_start, final_end; | ||
185 | u64 free_size; | ||
186 | struct memblock_region *r; | ||
187 | |||
188 | count = (memblock.reserved.cnt + memblock.memory.cnt) * 2; | ||
189 | |||
190 | range = find_range_array(count); | ||
191 | nr_range = 0; | ||
192 | |||
193 | addr = PFN_UP(addr); | ||
194 | limit = PFN_DOWN(limit); | ||
195 | |||
196 | for_each_memblock(memory, r) { | ||
197 | final_start = PFN_UP(r->base); | ||
198 | final_end = PFN_DOWN(r->base + r->size); | ||
199 | if (final_start >= final_end) | ||
200 | continue; | ||
201 | if (final_start >= limit || final_end <= addr) | ||
202 | continue; | ||
203 | |||
204 | nr_range = add_range(range, count, nr_range, final_start, final_end); | ||
205 | } | ||
206 | subtract_range(range, count, 0, addr); | ||
207 | subtract_range(range, count, limit, -1ULL); | ||
208 | |||
209 | /* Subtract memblock.reserved.region in range ? */ | ||
210 | if (!get_free) | ||
211 | goto sort_and_count_them; | ||
212 | for_each_memblock(reserved, r) { | ||
213 | final_start = PFN_DOWN(r->base); | ||
214 | final_end = PFN_UP(r->base + r->size); | ||
215 | if (final_start >= final_end) | ||
216 | continue; | ||
217 | if (final_start >= limit || final_end <= addr) | ||
218 | continue; | ||
219 | |||
220 | subtract_range(range, count, final_start, final_end); | ||
221 | } | ||
222 | |||
223 | sort_and_count_them: | ||
224 | nr_range = clean_sort_range(range, count); | ||
225 | |||
226 | free_size = 0; | ||
227 | for (i = 0; i < nr_range; i++) | ||
228 | free_size += range[i].end - range[i].start; | ||
229 | |||
230 | return free_size << PAGE_SHIFT; | ||
231 | } | ||
232 | |||
233 | u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) | ||
234 | { | ||
235 | return __memblock_x86_memory_in_range(addr, limit, true); | ||
236 | } | ||
237 | |||
238 | u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit) | ||
239 | { | ||
240 | return __memblock_x86_memory_in_range(addr, limit, false); | ||
241 | } | ||
242 | |||
243 | void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) | ||
244 | { | ||
245 | if (start == end) | ||
246 | return; | ||
247 | |||
248 | if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end)) | ||
249 | return; | ||
250 | |||
251 | memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name); | ||
252 | |||
253 | memblock_reserve(start, end - start); | ||
254 | } | ||
255 | |||
256 | void __init memblock_x86_free_range(u64 start, u64 end) | ||
257 | { | ||
258 | if (start == end) | ||
259 | return; | ||
260 | |||
261 | if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end)) | ||
262 | return; | ||
263 | |||
264 | memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1); | ||
265 | |||
266 | memblock_free(start, end - start); | ||
267 | } | ||
268 | |||
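Both helpers take an exclusive end address rather than a length, which is why callers elsewhere in this merge (memtest.c, setup_node_data(), the NUMA distance table) pass start + size. A minimal sketch with an invented range and label:

        u64 pa = 0x100000, len = 0x4000;                       /* invented example range */

        memblock_x86_reserve_range(pa, pa + len, "EXAMPLE");   /* [pa, pa+len) reserved  */
        /* ... and if the buffer turns out not to be needed after all ... */
        memblock_x86_free_range(pa, pa + len);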
269 | /* | ||
270 | * This must be called after memblock_x86_register_active_regions(), | ||
271 | * so that early_node_map[] is already filled. | ||
272 | */ | ||
273 | u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align) | ||
274 | { | ||
275 | u64 addr; | ||
276 | addr = find_memory_core_early(nid, size, align, start, end); | ||
277 | if (addr != MEMBLOCK_ERROR) | ||
278 | return addr; | ||
279 | |||
280 | /* Fallback, should already have start end within node range */ | ||
281 | return memblock_find_in_range(start, end, size, align); | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * Finds an active region in the address range from start_pfn to last_pfn and | ||
286 | * returns its range in ei_startpfn and ei_endpfn for the memblock entry. | ||
287 | */ | ||
288 | static int __init memblock_x86_find_active_region(const struct memblock_region *ei, | ||
289 | unsigned long start_pfn, | ||
290 | unsigned long last_pfn, | ||
291 | unsigned long *ei_startpfn, | ||
292 | unsigned long *ei_endpfn) | ||
293 | { | ||
294 | u64 align = PAGE_SIZE; | ||
295 | |||
296 | *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; | ||
297 | *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; | ||
298 | |||
299 | /* Skip map entries smaller than a page */ | ||
300 | if (*ei_startpfn >= *ei_endpfn) | ||
301 | return 0; | ||
302 | |||
303 | /* Skip if map is outside the node */ | ||
304 | if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) | ||
305 | return 0; | ||
306 | |||
307 | /* Check for overlaps */ | ||
308 | if (*ei_startpfn < start_pfn) | ||
309 | *ei_startpfn = start_pfn; | ||
310 | if (*ei_endpfn > last_pfn) | ||
311 | *ei_endpfn = last_pfn; | ||
312 | |||
313 | return 1; | ||
314 | } | ||
315 | |||
316 | /* Walk the memblock.memory map and register active regions within a node */ | ||
317 | void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn, | ||
318 | unsigned long last_pfn) | ||
319 | { | ||
320 | unsigned long ei_startpfn; | ||
321 | unsigned long ei_endpfn; | ||
322 | struct memblock_region *r; | ||
323 | |||
324 | for_each_memblock(memory, r) | ||
325 | if (memblock_x86_find_active_region(r, start_pfn, last_pfn, | ||
326 | &ei_startpfn, &ei_endpfn)) | ||
327 | add_active_range(nid, ei_startpfn, ei_endpfn); | ||
328 | } | ||
329 | |||
330 | /* | ||
331 | * Find the hole size (in bytes) in the memory range. | ||
332 | * @start: starting address of the memory range to scan | ||
333 | * @end: ending address of the memory range to scan | ||
334 | */ | ||
335 | u64 __init memblock_x86_hole_size(u64 start, u64 end) | ||
336 | { | ||
337 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
338 | unsigned long last_pfn = end >> PAGE_SHIFT; | ||
339 | unsigned long ei_startpfn, ei_endpfn, ram = 0; | ||
340 | struct memblock_region *r; | ||
341 | |||
342 | for_each_memblock(memory, r) | ||
343 | if (memblock_x86_find_active_region(r, start_pfn, last_pfn, | ||
344 | &ei_startpfn, &ei_endpfn)) | ||
345 | ram += ei_endpfn - ei_startpfn; | ||
346 | |||
347 | return end - start - ((u64)ram << PAGE_SHIFT); | ||
348 | } | ||
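A quick worked example of the arithmetic above as a stand-alone program: if a 16 MB window contains two page-aligned memory regions totalling 12 MB, the remaining 4 MB is reported as hole. The sizes are invented:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                uint64_t start = 0, end = 16ULL << 20;     /* 16 MB window            */
                uint64_t page = 4096;
                uint64_t ram_pages = (8ULL << 20) / page   /* region 1: 8 MB of RAM   */
                                   + (4ULL << 20) / page;  /* region 2: 4 MB of RAM   */
                uint64_t hole = end - start - ram_pages * page;

                printf("hole = %llu MB\n", (unsigned long long)(hole >> 20));   /* 4 */
                return 0;
        }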
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 18d244f70205..92faf3a1c53e 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c | |||
@@ -6,8 +6,7 @@ | |||
6 | #include <linux/smp.h> | 6 | #include <linux/smp.h> |
7 | #include <linux/init.h> | 7 | #include <linux/init.h> |
8 | #include <linux/pfn.h> | 8 | #include <linux/pfn.h> |
9 | 9 | #include <linux/memblock.h> | |
10 | #include <asm/e820.h> | ||
11 | 10 | ||
12 | static u64 patterns[] __initdata = { | 11 | static u64 patterns[] __initdata = { |
13 | 0, | 12 | 0, |
@@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) | |||
35 | (unsigned long long) pattern, | 34 | (unsigned long long) pattern, |
36 | (unsigned long long) start_bad, | 35 | (unsigned long long) start_bad, |
37 | (unsigned long long) end_bad); | 36 | (unsigned long long) end_bad); |
38 | reserve_early(start_bad, end_bad, "BAD RAM"); | 37 | memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); |
39 | } | 38 | } |
40 | 39 | ||
41 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) | 40 | static void __init memtest(u64 pattern, u64 start_phys, u64 size) |
@@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end) | |||
74 | u64 size = 0; | 73 | u64 size = 0; |
75 | 74 | ||
76 | while (start < end) { | 75 | while (start < end) { |
77 | start = find_e820_area_size(start, &size, 1); | 76 | start = memblock_x86_find_in_range_size(start, &size, 1); |
78 | 77 | ||
79 | /* done ? */ | 78 | /* done ? */ |
80 | if (start >= end) | 79 | if (start >= end) |
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 787c52ca49c3..f5510d889a22 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -1,15 +1,112 @@ | |||
1 | /* Common code for 32 and 64-bit NUMA */ | 1 | /* Common code for 32 and 64-bit NUMA */ |
2 | #include <linux/topology.h> | 2 | #include <linux/kernel.h> |
3 | #include <linux/module.h> | 3 | #include <linux/mm.h> |
4 | #include <linux/string.h> | ||
5 | #include <linux/init.h> | ||
4 | #include <linux/bootmem.h> | 6 | #include <linux/bootmem.h> |
7 | #include <linux/memblock.h> | ||
8 | #include <linux/mmzone.h> | ||
9 | #include <linux/ctype.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/nodemask.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/topology.h> | ||
14 | |||
15 | #include <asm/e820.h> | ||
16 | #include <asm/proto.h> | ||
17 | #include <asm/dma.h> | ||
18 | #include <asm/acpi.h> | ||
19 | #include <asm/amd_nb.h> | ||
20 | |||
21 | #include "numa_internal.h" | ||
22 | |||
23 | int __initdata numa_off; | ||
24 | nodemask_t numa_nodes_parsed __initdata; | ||
25 | |||
26 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | ||
27 | EXPORT_SYMBOL(node_data); | ||
28 | |||
29 | static struct numa_meminfo numa_meminfo | ||
30 | #ifndef CONFIG_MEMORY_HOTPLUG | ||
31 | __initdata | ||
32 | #endif | ||
33 | ; | ||
34 | |||
35 | static int numa_distance_cnt; | ||
36 | static u8 *numa_distance; | ||
37 | |||
38 | static __init int numa_setup(char *opt) | ||
39 | { | ||
40 | if (!opt) | ||
41 | return -EINVAL; | ||
42 | if (!strncmp(opt, "off", 3)) | ||
43 | numa_off = 1; | ||
44 | #ifdef CONFIG_NUMA_EMU | ||
45 | if (!strncmp(opt, "fake=", 5)) | ||
46 | numa_emu_cmdline(opt + 5); | ||
47 | #endif | ||
48 | #ifdef CONFIG_ACPI_NUMA | ||
49 | if (!strncmp(opt, "noacpi", 6)) | ||
50 | acpi_numa = -1; | ||
51 | #endif | ||
52 | return 0; | ||
53 | } | ||
54 | early_param("numa", numa_setup); | ||
5 | 55 | ||
6 | /* | 56 | /* |
7 | * Which logical CPUs are on which nodes | 57 | * apicid, cpu, node mappings |
8 | */ | 58 | */ |
59 | s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
60 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
61 | }; | ||
62 | |||
63 | int __cpuinit numa_cpu_node(int cpu) | ||
64 | { | ||
65 | int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); | ||
66 | |||
67 | if (apicid != BAD_APICID) | ||
68 | return __apicid_to_node[apicid]; | ||
69 | return NUMA_NO_NODE; | ||
70 | } | ||
71 | |||
9 | cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; | 72 | cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; |
10 | EXPORT_SYMBOL(node_to_cpumask_map); | 73 | EXPORT_SYMBOL(node_to_cpumask_map); |
11 | 74 | ||
12 | /* | 75 | /* |
76 | * Map cpu index to node index | ||
77 | */ | ||
78 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | ||
79 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
80 | |||
81 | void __cpuinit numa_set_node(int cpu, int node) | ||
82 | { | ||
83 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | ||
84 | |||
85 | /* early setting, no percpu area yet */ | ||
86 | if (cpu_to_node_map) { | ||
87 | cpu_to_node_map[cpu] = node; | ||
88 | return; | ||
89 | } | ||
90 | |||
91 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
92 | if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { | ||
93 | printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); | ||
94 | dump_stack(); | ||
95 | return; | ||
96 | } | ||
97 | #endif | ||
98 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
99 | |||
100 | if (node != NUMA_NO_NODE) | ||
101 | set_cpu_numa_node(cpu, node); | ||
102 | } | ||
103 | |||
104 | void __cpuinit numa_clear_node(int cpu) | ||
105 | { | ||
106 | numa_set_node(cpu, NUMA_NO_NODE); | ||
107 | } | ||
108 | |||
109 | /* | ||
13 | * Allocate node_to_cpumask_map based on number of available nodes | 110 | * Allocate node_to_cpumask_map based on number of available nodes |
14 | * Requires node_possible_map to be valid. | 111 | * Requires node_possible_map to be valid. |
15 | * | 112 | * |
@@ -35,7 +132,659 @@ void __init setup_node_to_cpumask_map(void) | |||
35 | pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); | 132 | pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); |
36 | } | 133 | } |
37 | 134 | ||
38 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | 135 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, |
136 | struct numa_meminfo *mi) | ||
137 | { | ||
138 | /* ignore zero length blks */ | ||
139 | if (start == end) | ||
140 | return 0; | ||
141 | |||
142 | /* whine about and ignore invalid blks */ | ||
143 | if (start > end || nid < 0 || nid >= MAX_NUMNODES) { | ||
144 | pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", | ||
145 | nid, start, end); | ||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | if (mi->nr_blks >= NR_NODE_MEMBLKS) { | ||
150 | pr_err("NUMA: too many memblk ranges\n"); | ||
151 | return -EINVAL; | ||
152 | } | ||
153 | |||
154 | mi->blk[mi->nr_blks].start = start; | ||
155 | mi->blk[mi->nr_blks].end = end; | ||
156 | mi->blk[mi->nr_blks].nid = nid; | ||
157 | mi->nr_blks++; | ||
158 | return 0; | ||
159 | } | ||
160 | |||
161 | /** | ||
162 | * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo | ||
163 | * @idx: Index of memblk to remove | ||
164 | * @mi: numa_meminfo to remove memblk from | ||
165 | * | ||
166 | * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and | ||
167 | * decrementing @mi->nr_blks. | ||
168 | */ | ||
169 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) | ||
170 | { | ||
171 | mi->nr_blks--; | ||
172 | memmove(&mi->blk[idx], &mi->blk[idx + 1], | ||
173 | (mi->nr_blks - idx) * sizeof(mi->blk[0])); | ||
174 | } | ||
175 | |||
176 | /** | ||
177 | * numa_add_memblk - Add one numa_memblk to numa_meminfo | ||
178 | * @nid: NUMA node ID of the new memblk | ||
179 | * @start: Start address of the new memblk | ||
180 | * @end: End address of the new memblk | ||
181 | * | ||
182 | * Add a new memblk to the default numa_meminfo. | ||
183 | * | ||
184 | * RETURNS: | ||
185 | * 0 on success, -errno on failure. | ||
186 | */ | ||
187 | int __init numa_add_memblk(int nid, u64 start, u64 end) | ||
188 | { | ||
189 | return numa_add_memblk_to(nid, start, end, &numa_meminfo); | ||
190 | } | ||
191 | |||
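numa_add_memblk() is the hook the detection back-ends (ACPI SRAT, the AMD northbridge scan, NUMA emulation) use to feed ranges into the default numa_meminfo. A hedged sketch of a parser registering two nodes; the ranges are invented, and a real parser would take them from firmware tables:

        /* Sketch only: node 0 gets the first 2 GB, node 1 the next 2 GB. */
        if (numa_add_memblk(0, 0ULL, 0x80000000ULL) < 0)
                return -EINVAL;
        if (numa_add_memblk(1, 0x80000000ULL, 0x100000000ULL) < 0)
                return -EINVAL;
        node_set(0, numa_nodes_parsed);
        node_set(1, numa_nodes_parsed);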
192 | /* Initialize NODE_DATA for a node on the local memory */ | ||
193 | static void __init setup_node_data(int nid, u64 start, u64 end) | ||
194 | { | ||
195 | const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); | ||
196 | const u64 nd_high = PFN_PHYS(max_pfn_mapped); | ||
197 | const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); | ||
198 | bool remapped = false; | ||
199 | u64 nd_pa; | ||
200 | void *nd; | ||
201 | int tnid; | ||
202 | |||
203 | /* | ||
204 | * Don't confuse VM with a node that doesn't have the | ||
205 | * minimum amount of memory: | ||
206 | */ | ||
207 | if (end && (end - start) < NODE_MIN_SIZE) | ||
208 | return; | ||
209 | |||
210 | /* initialize remap allocator before aligning to ZONE_ALIGN */ | ||
211 | init_alloc_remap(nid, start, end); | ||
212 | |||
213 | start = roundup(start, ZONE_ALIGN); | ||
214 | |||
215 | printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", | ||
216 | nid, start, end); | ||
217 | |||
218 | /* | ||
219 | * Allocate node data. Try remap allocator first, node-local | ||
220 | * memory and then any node. Never allocate in DMA zone. | ||
221 | */ | ||
222 | nd = alloc_remap(nid, nd_size); | ||
223 | if (nd) { | ||
224 | nd_pa = __pa(nd); | ||
225 | remapped = true; | ||
226 | } else { | ||
227 | nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, | ||
228 | nd_size, SMP_CACHE_BYTES); | ||
229 | if (nd_pa == MEMBLOCK_ERROR) | ||
230 | nd_pa = memblock_find_in_range(nd_low, nd_high, | ||
231 | nd_size, SMP_CACHE_BYTES); | ||
232 | if (nd_pa == MEMBLOCK_ERROR) { | ||
233 | pr_err("Cannot find %zu bytes in node %d\n", | ||
234 | nd_size, nid); | ||
235 | return; | ||
236 | } | ||
237 | memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); | ||
238 | nd = __va(nd_pa); | ||
239 | } | ||
240 | |||
241 | /* report and initialize */ | ||
242 | printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", | ||
243 | nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); | ||
244 | tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); | ||
245 | if (!remapped && tnid != nid) | ||
246 | printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); | ||
247 | |||
248 | node_data[nid] = nd; | ||
249 | memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); | ||
250 | NODE_DATA(nid)->node_id = nid; | ||
251 | NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; | ||
252 | NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; | ||
253 | |||
254 | node_set_online(nid); | ||
255 | } | ||
256 | |||
257 | /** | ||
258 | * numa_cleanup_meminfo - Cleanup a numa_meminfo | ||
259 | * @mi: numa_meminfo to clean up | ||
260 | * | ||
261 | * Sanitize @mi by merging and removing unnecessary memblks. Also check for | ||
262 | * conflicts and clear unused memblks. | ||
263 | * | ||
264 | * RETURNS: | ||
265 | * 0 on success, -errno on failure. | ||
266 | */ | ||
267 | int __init numa_cleanup_meminfo(struct numa_meminfo *mi) | ||
268 | { | ||
269 | const u64 low = 0; | ||
270 | const u64 high = PFN_PHYS(max_pfn); | ||
271 | int i, j, k; | ||
272 | |||
273 | /* first, trim all entries */ | ||
274 | for (i = 0; i < mi->nr_blks; i++) { | ||
275 | struct numa_memblk *bi = &mi->blk[i]; | ||
276 | |||
277 | /* make sure all blocks are inside the limits */ | ||
278 | bi->start = max(bi->start, low); | ||
279 | bi->end = min(bi->end, high); | ||
280 | |||
281 | /* and there's no empty block */ | ||
282 | if (bi->start >= bi->end) | ||
283 | numa_remove_memblk_from(i--, mi); | ||
284 | } | ||
285 | |||
286 | /* merge neighboring / overlapping entries */ | ||
287 | for (i = 0; i < mi->nr_blks; i++) { | ||
288 | struct numa_memblk *bi = &mi->blk[i]; | ||
289 | |||
290 | for (j = i + 1; j < mi->nr_blks; j++) { | ||
291 | struct numa_memblk *bj = &mi->blk[j]; | ||
292 | u64 start, end; | ||
293 | |||
294 | /* | ||
295 | * See whether there are overlapping blocks. Whine | ||
296 | * about but allow overlaps of the same nid. They | ||
297 | * will be merged below. | ||
298 | */ | ||
299 | if (bi->end > bj->start && bi->start < bj->end) { | ||
300 | if (bi->nid != bj->nid) { | ||
301 | pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", | ||
302 | bi->nid, bi->start, bi->end, | ||
303 | bj->nid, bj->start, bj->end); | ||
304 | return -EINVAL; | ||
305 | } | ||
306 | pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", | ||
307 | bi->nid, bi->start, bi->end, | ||
308 | bj->start, bj->end); | ||
309 | } | ||
310 | |||
311 | /* | ||
312 | * Join together blocks on the same node, holes | ||
313 | * between which don't overlap with memory on other | ||
314 | * nodes. | ||
315 | */ | ||
316 | if (bi->nid != bj->nid) | ||
317 | continue; | ||
318 | start = min(bi->start, bj->start); | ||
319 | end = max(bi->end, bj->end); | ||
320 | for (k = 0; k < mi->nr_blks; k++) { | ||
321 | struct numa_memblk *bk = &mi->blk[k]; | ||
322 | |||
323 | if (bi->nid == bk->nid) | ||
324 | continue; | ||
325 | if (start < bk->end && end > bk->start) | ||
326 | break; | ||
327 | } | ||
328 | if (k < mi->nr_blks) | ||
329 | continue; | ||
330 | printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", | ||
331 | bi->nid, bi->start, bi->end, bj->start, bj->end, | ||
332 | start, end); | ||
333 | bi->start = start; | ||
334 | bi->end = end; | ||
335 | numa_remove_memblk_from(j--, mi); | ||
336 | } | ||
337 | } | ||
338 | |||
339 | /* clear unused ones */ | ||
340 | for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { | ||
341 | mi->blk[i].start = mi->blk[i].end = 0; | ||
342 | mi->blk[i].nid = NUMA_NO_NODE; | ||
343 | } | ||
344 | |||
345 | return 0; | ||
346 | } | ||
347 | |||
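The merge step above joins two same-node blocks only if the span covering both avoids every other node's memory. A tiny stand-alone model of that overlap test, with invented ranges:

        #include <stdio.h>
        #include <stdint.h>

        int main(void)
        {
                /* node 0: [0x0, 0x1000) and [0x3000, 0x4000); node 1: [0x5000, 0x6000) */
                uint64_t a_start = 0x0000, a_end = 0x1000;
                uint64_t b_start = 0x3000, b_end = 0x4000;
                uint64_t other_start = 0x5000, other_end = 0x6000;

                uint64_t start = a_start < b_start ? a_start : b_start;
                uint64_t end   = a_end   > b_end   ? a_end   : b_end;

                if (start < other_end && end > other_start)
                        printf("span overlaps node 1: keep the blocks separate\n");
                else
                        printf("merge node 0 blocks into [%#llx, %#llx)\n",
                               (unsigned long long)start, (unsigned long long)end);
                return 0;
        }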
348 | /* | ||
349 | * Set nodes, which have memory in @mi, in *@nodemask. | ||
350 | */ | ||
351 | static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, | ||
352 | const struct numa_meminfo *mi) | ||
353 | { | ||
354 | int i; | ||
355 | |||
356 | for (i = 0; i < ARRAY_SIZE(mi->blk); i++) | ||
357 | if (mi->blk[i].start != mi->blk[i].end && | ||
358 | mi->blk[i].nid != NUMA_NO_NODE) | ||
359 | node_set(mi->blk[i].nid, *nodemask); | ||
360 | } | ||
361 | |||
362 | /** | ||
363 | * numa_reset_distance - Reset NUMA distance table | ||
364 | * | ||
365 | * The current table is freed. The next numa_set_distance() call will | ||
366 | * create a new one. | ||
367 | */ | ||
368 | void __init numa_reset_distance(void) | ||
369 | { | ||
370 | size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); | ||
371 | |||
372 | /* numa_distance could be 1LU marking allocation failure, test cnt */ | ||
373 | if (numa_distance_cnt) | ||
374 | memblock_x86_free_range(__pa(numa_distance), | ||
375 | __pa(numa_distance) + size); | ||
376 | numa_distance_cnt = 0; | ||
377 | numa_distance = NULL; /* enable table creation */ | ||
378 | } | ||
379 | |||
380 | static int __init numa_alloc_distance(void) | ||
381 | { | ||
382 | nodemask_t nodes_parsed; | ||
383 | size_t size; | ||
384 | int i, j, cnt = 0; | ||
385 | u64 phys; | ||
386 | |||
387 | /* size the new table and allocate it */ | ||
388 | nodes_parsed = numa_nodes_parsed; | ||
389 | numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); | ||
390 | |||
391 | for_each_node_mask(i, nodes_parsed) | ||
392 | cnt = i; | ||
393 | cnt++; | ||
394 | size = cnt * cnt * sizeof(numa_distance[0]); | ||
395 | |||
396 | phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), | ||
397 | size, PAGE_SIZE); | ||
398 | if (phys == MEMBLOCK_ERROR) { | ||
399 | pr_warning("NUMA: Warning: can't allocate distance table!\n"); | ||
400 | /* don't retry until explicitly reset */ | ||
401 | numa_distance = (void *)1LU; | ||
402 | return -ENOMEM; | ||
403 | } | ||
404 | memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); | ||
405 | |||
406 | numa_distance = __va(phys); | ||
407 | numa_distance_cnt = cnt; | ||
408 | |||
409 | /* fill with the default distances */ | ||
410 | for (i = 0; i < cnt; i++) | ||
411 | for (j = 0; j < cnt; j++) | ||
412 | numa_distance[i * cnt + j] = i == j ? | ||
413 | LOCAL_DISTANCE : REMOTE_DISTANCE; | ||
414 | printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); | ||
415 | |||
416 | return 0; | ||
417 | } | ||
418 | |||
419 | /** | ||
420 | * numa_set_distance - Set NUMA distance from one NUMA node to another | ||
421 | * @from: the 'from' node to set distance | ||
422 | * @to: the 'to' node to set distance | ||
423 | * @distance: NUMA distance | ||
424 | * | ||
425 | * Set the distance from node @from to @to to @distance. If distance table | ||
426 | * doesn't exist, one which is large enough to accommodate all the currently | ||
427 | * known nodes will be created. | ||
428 | * | ||
429 | * If such a table cannot be allocated, a warning is printed and further | ||
430 | * calls are ignored until the distance table is reset with | ||
431 | * numa_reset_distance(). | ||
432 | * | ||
433 | * If @from or @to is higher than the highest known node at the time of | ||
434 | * table creation or @distance doesn't make sense, the call is ignored. | ||
435 | * This is to allow simplification of specific NUMA config implementations. | ||
436 | */ | ||
437 | void __init numa_set_distance(int from, int to, int distance) | ||
438 | { | ||
439 | if (!numa_distance && numa_alloc_distance() < 0) | ||
440 | return; | ||
441 | |||
442 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) { | ||
443 | printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", | ||
444 | from, to, distance); | ||
445 | return; | ||
446 | } | ||
447 | |||
448 | if ((u8)distance != distance || | ||
449 | (from == to && distance != LOCAL_DISTANCE)) { | ||
450 | pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", | ||
451 | from, to, distance); | ||
452 | return; | ||
453 | } | ||
454 | |||
455 | numa_distance[from * numa_distance_cnt + to] = distance; | ||
456 | } | ||
457 | |||
458 | int __node_distance(int from, int to) | ||
459 | { | ||
460 | if (from >= numa_distance_cnt || to >= numa_distance_cnt) | ||
461 | return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; | ||
462 | return numa_distance[from * numa_distance_cnt + to]; | ||
463 | } | ||
464 | EXPORT_SYMBOL(__node_distance); | ||
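__node_distance() indexes a flat cnt x cnt byte table as from * cnt + to, with defaults of 10 for local and 20 for remote distances as in the usual ACPI SLIT convention. A small userspace sketch of the same allocation and indexing (the constants are assumptions, not taken from this patch):

#include <stdio.h>
#include <stdlib.h>

#define LOCAL_DISTANCE	10	/* assumed default for i == j */
#define REMOTE_DISTANCE	20	/* assumed default for i != j */

static unsigned char *dist;	/* flat cnt x cnt table, like numa_distance */
static int dist_cnt;

static void dist_alloc(int cnt)
{
	dist = malloc(cnt * cnt);
	dist_cnt = cnt;
	for (int i = 0; i < cnt; i++)
		for (int j = 0; j < cnt; j++)
			dist[i * cnt + j] = (i == j) ? LOCAL_DISTANCE : REMOTE_DISTANCE;
}

static int node_distance(int from, int to)
{
	if (from >= dist_cnt || to >= dist_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return dist[from * dist_cnt + to];
}

int main(void)
{
	dist_alloc(2);
	dist[0 * dist_cnt + 1] = 21;	/* like numa_set_distance(0, 1, 21) */
	printf("%d %d %d\n", node_distance(0, 0),
	       node_distance(0, 1), node_distance(1, 0));	/* 10 21 20 */
	free(dist);
	return 0;
}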
465 | |||
466 | /* | ||
467 | * Sanity check to catch more bad NUMA configurations (they are amazingly | ||
468 | * common). Make sure the nodes cover all memory. | ||
469 | */ | ||
470 | static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) | ||
471 | { | ||
472 | u64 numaram, e820ram; | ||
473 | int i; | ||
474 | |||
475 | numaram = 0; | ||
476 | for (i = 0; i < mi->nr_blks; i++) { | ||
477 | u64 s = mi->blk[i].start >> PAGE_SHIFT; | ||
478 | u64 e = mi->blk[i].end >> PAGE_SHIFT; | ||
479 | numaram += e - s; | ||
480 | numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); | ||
481 | if ((s64)numaram < 0) | ||
482 | numaram = 0; | ||
483 | } | ||
484 | |||
485 | e820ram = max_pfn - (memblock_x86_hole_size(0, | ||
486 | PFN_PHYS(max_pfn)) >> PAGE_SHIFT); | ||
487 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | ||
488 | if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { | ||
489 | printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", | ||
490 | (numaram << PAGE_SHIFT) >> 20, | ||
491 | (e820ram << PAGE_SHIFT) >> 20); | ||
492 | return false; | ||
493 | } | ||
494 | return true; | ||
495 | } | ||
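The slack term 1 << (20 - PAGE_SHIFT) is 256 pages with 4 KiB pages, i.e. exactly 1 MiB of tolerated discrepancy between the e820 map and the NUMA coverage. A trivial check of that arithmetic (PAGE_SHIFT value assumed):

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages */

int main(void)
{
	long slack_pages = 1L << (20 - PAGE_SHIFT);

	/* 256 pages == 1024 KiB: the tolerated NUMA/e820 coverage gap */
	printf("%ld pages = %ld KiB\n", slack_pages,
	       (slack_pages << PAGE_SHIFT) >> 10);
	return 0;
}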
496 | |||
497 | static int __init numa_register_memblks(struct numa_meminfo *mi) | ||
498 | { | ||
499 | int i, nid; | ||
500 | |||
501 | /* Account for nodes with cpus and no memory */ | ||
502 | node_possible_map = numa_nodes_parsed; | ||
503 | numa_nodemask_from_meminfo(&node_possible_map, mi); | ||
504 | if (WARN_ON(nodes_empty(node_possible_map))) | ||
505 | return -EINVAL; | ||
506 | |||
507 | for (i = 0; i < mi->nr_blks; i++) | ||
508 | memblock_x86_register_active_regions(mi->blk[i].nid, | ||
509 | mi->blk[i].start >> PAGE_SHIFT, | ||
510 | mi->blk[i].end >> PAGE_SHIFT); | ||
511 | |||
512 | /* for out of order entries */ | ||
513 | sort_node_map(); | ||
514 | if (!numa_meminfo_cover_memory(mi)) | ||
515 | return -EINVAL; | ||
516 | |||
517 | /* Finally register nodes. */ | ||
518 | for_each_node_mask(nid, node_possible_map) { | ||
519 | u64 start = PFN_PHYS(max_pfn); | ||
520 | u64 end = 0; | ||
521 | |||
522 | for (i = 0; i < mi->nr_blks; i++) { | ||
523 | if (nid != mi->blk[i].nid) | ||
524 | continue; | ||
525 | start = min(mi->blk[i].start, start); | ||
526 | end = max(mi->blk[i].end, end); | ||
527 | } | ||
528 | |||
529 | if (start < end) | ||
530 | setup_node_data(nid, start, end); | ||
531 | } | ||
532 | |||
533 | return 0; | ||
534 | } | ||
535 | |||
536 | /* | ||
537 | * There are unfortunately some poorly designed mainboards around that | ||
538 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
539 | * mapping. To avoid this fill in the mapping for all possible CPUs, | ||
540 | * as the number of CPUs is not known yet. We round robin the existing | ||
541 | * nodes. | ||
542 | */ | ||
543 | static void __init numa_init_array(void) | ||
544 | { | ||
545 | int rr, i; | ||
546 | |||
547 | rr = first_node(node_online_map); | ||
548 | for (i = 0; i < nr_cpu_ids; i++) { | ||
549 | if (early_cpu_to_node(i) != NUMA_NO_NODE) | ||
550 | continue; | ||
551 | numa_set_node(i, rr); | ||
552 | rr = next_node(rr, node_online_map); | ||
553 | if (rr == MAX_NUMNODES) | ||
554 | rr = first_node(node_online_map); | ||
555 | } | ||
556 | } | ||
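A compact userspace rendition of the round-robin fixup above, with an illustrative online-node set of {0, 2} and six CPUs that all start without a node:

#include <stdio.h>

int main(void)
{
	int online[] = { 0, 2 };	/* stand-in for node_online_map */
	int nr_online = 2, nr_cpus = 6, rr = 0;

	/* assign node-less CPUs round robin over the online nodes */
	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		printf("cpu %d -> node %d\n", cpu, online[rr]);
		rr = (rr + 1) % nr_online;	/* wrap like next_node()/first_node() */
	}
	return 0;
}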
557 | |||
558 | static int __init numa_init(int (*init_func)(void)) | ||
559 | { | ||
560 | int i; | ||
561 | int ret; | ||
562 | |||
563 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
564 | set_apicid_to_node(i, NUMA_NO_NODE); | ||
565 | |||
566 | nodes_clear(numa_nodes_parsed); | ||
567 | nodes_clear(node_possible_map); | ||
568 | nodes_clear(node_online_map); | ||
569 | memset(&numa_meminfo, 0, sizeof(numa_meminfo)); | ||
570 | remove_all_active_ranges(); | ||
571 | numa_reset_distance(); | ||
572 | |||
573 | ret = init_func(); | ||
574 | if (ret < 0) | ||
575 | return ret; | ||
576 | ret = numa_cleanup_meminfo(&numa_meminfo); | ||
577 | if (ret < 0) | ||
578 | return ret; | ||
579 | |||
580 | numa_emulation(&numa_meminfo, numa_distance_cnt); | ||
581 | |||
582 | ret = numa_register_memblks(&numa_meminfo); | ||
583 | if (ret < 0) | ||
584 | return ret; | ||
585 | |||
586 | for (i = 0; i < nr_cpu_ids; i++) { | ||
587 | int nid = early_cpu_to_node(i); | ||
588 | |||
589 | if (nid == NUMA_NO_NODE) | ||
590 | continue; | ||
591 | if (!node_online(nid)) | ||
592 | numa_clear_node(i); | ||
593 | } | ||
594 | numa_init_array(); | ||
595 | return 0; | ||
596 | } | ||
597 | |||
598 | /** | ||
599 | * dummy_numa_init - Fallback dummy NUMA init | ||
600 | * | ||
601 | * Used if there's no underlying NUMA architecture, NUMA initialization | ||
602 | * fails, or NUMA is disabled on the command line. | ||
603 | * | ||
604 | * Must online at least one node and add memory blocks that cover all | ||
605 | * allowed memory. This function must not fail. | ||
606 | */ | ||
607 | static int __init dummy_numa_init(void) | ||
608 | { | ||
609 | printk(KERN_INFO "%s\n", | ||
610 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
611 | printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", | ||
612 | 0LLU, PFN_PHYS(max_pfn)); | ||
613 | |||
614 | node_set(0, numa_nodes_parsed); | ||
615 | numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); | ||
616 | |||
617 | return 0; | ||
618 | } | ||
619 | |||
620 | /** | ||
621 | * x86_numa_init - Initialize NUMA | ||
622 | * | ||
623 | * Try each configured NUMA initialization method until one succeeds. The | ||
624 | * last fallback is a dummy single node config encompassing whole memory and | ||
625 | * never fails. | ||
626 | */ | ||
627 | void __init x86_numa_init(void) | ||
628 | { | ||
629 | if (!numa_off) { | ||
630 | #ifdef CONFIG_X86_NUMAQ | ||
631 | if (!numa_init(numaq_numa_init)) | ||
632 | return; | ||
633 | #endif | ||
634 | #ifdef CONFIG_ACPI_NUMA | ||
635 | if (!numa_init(x86_acpi_numa_init)) | ||
636 | return; | ||
637 | #endif | ||
638 | #ifdef CONFIG_AMD_NUMA | ||
639 | if (!numa_init(amd_numa_init)) | ||
640 | return; | ||
641 | #endif | ||
642 | } | ||
643 | |||
644 | numa_init(dummy_numa_init); | ||
645 | } | ||
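x86_numa_init() is simply "try each detector in order and stop at the first that returns 0", with the dummy config as the detector that cannot fail. A sketch of that pattern with stand-in detectors (all names below are hypothetical):

#include <stdio.h>

/* stand-in detectors: the first two "fail", the dummy never does */
static int acpi_detect(void)  { return -1; }
static int amd_detect(void)   { return -1; }
static int dummy_detect(void) { return 0; }

int main(void)
{
	int (*methods[])(void) = { acpi_detect, amd_detect, dummy_detect };
	const char *names[] = { "acpi", "amd", "dummy" };

	/* the real numa_init() also resets all NUMA state before each attempt */
	for (unsigned int i = 0; i < sizeof(methods) / sizeof(methods[0]); i++)
		if (!methods[i]()) {
			printf("initialized via %s\n", names[i]);
			break;
		}
	return 0;
}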
646 | |||
647 | static __init int find_near_online_node(int node) | ||
648 | { | ||
649 | int n, val; | ||
650 | int min_val = INT_MAX; | ||
651 | int best_node = -1; | ||
652 | |||
653 | for_each_online_node(n) { | ||
654 | val = node_distance(node, n); | ||
655 | |||
656 | if (val < min_val) { | ||
657 | min_val = val; | ||
658 | best_node = n; | ||
659 | } | ||
660 | } | ||
661 | |||
662 | return best_node; | ||
663 | } | ||
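find_near_online_node() just minimizes node_distance() over the online nodes. With an illustrative 4-node distance matrix and only nodes {0, 1} online, a memoryless node 3 is steered to node 1:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	int dist[4][4] = {		/* made-up SLIT-style distances */
		{ 10, 20, 20, 30 },
		{ 20, 10, 30, 20 },
		{ 20, 30, 10, 20 },
		{ 30, 20, 20, 10 },
	};
	int online[] = { 0, 1 };
	int node = 3, best = -1, min = INT_MAX;

	for (unsigned int i = 0; i < sizeof(online) / sizeof(online[0]); i++)
		if (dist[node][online[i]] < min) {
			min = dist[node][online[i]];
			best = online[i];
		}
	printf("node %d -> nearest online node %d\n", node, best);	/* 1 */
	return 0;
}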
664 | |||
665 | /* | ||
666 | * Setup early cpu_to_node. | ||
667 | * | ||
668 | * Populate cpu_to_node[] only if x86_cpu_to_apicid[], | ||
669 | * and apicid_to_node[] tables have valid entries for a CPU. | ||
670 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
671 | * emulation and faking node case (when running a kernel compiled | ||
672 | * for NUMA on a non NUMA box), which is OK as cpu_to_node[] | ||
673 | * is already initialized in a round robin manner at numa_init_array, | ||
674 | * prior to this call, and this initialization is good enough | ||
675 | * for the fake NUMA cases. | ||
676 | * | ||
677 | * Called before the per_cpu areas are setup. | ||
678 | */ | ||
679 | void __init init_cpu_to_node(void) | ||
680 | { | ||
681 | int cpu; | ||
682 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | ||
683 | |||
684 | BUG_ON(cpu_to_apicid == NULL); | ||
685 | |||
686 | for_each_possible_cpu(cpu) { | ||
687 | int node = numa_cpu_node(cpu); | ||
688 | |||
689 | if (node == NUMA_NO_NODE) | ||
690 | continue; | ||
691 | if (!node_online(node)) | ||
692 | node = find_near_online_node(node); | ||
693 | numa_set_node(cpu, node); | ||
694 | } | ||
695 | } | ||
696 | |||
697 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
698 | |||
699 | # ifndef CONFIG_NUMA_EMU | ||
700 | void __cpuinit numa_add_cpu(int cpu) | ||
701 | { | ||
702 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
703 | } | ||
704 | |||
705 | void __cpuinit numa_remove_cpu(int cpu) | ||
706 | { | ||
707 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
708 | } | ||
709 | # endif /* !CONFIG_NUMA_EMU */ | ||
710 | |||
711 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
712 | |||
713 | int __cpu_to_node(int cpu) | ||
714 | { | ||
715 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | ||
716 | printk(KERN_WARNING | ||
717 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
718 | dump_stack(); | ||
719 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
720 | } | ||
721 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
722 | } | ||
723 | EXPORT_SYMBOL(__cpu_to_node); | ||
724 | |||
725 | /* | ||
726 | * Same function as cpu_to_node() but used if called before the | ||
727 | * per_cpu areas are setup. | ||
728 | */ | ||
729 | int early_cpu_to_node(int cpu) | ||
730 | { | ||
731 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
732 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
733 | |||
734 | if (!cpu_possible(cpu)) { | ||
735 | printk(KERN_WARNING | ||
736 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
737 | dump_stack(); | ||
738 | return NUMA_NO_NODE; | ||
739 | } | ||
740 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
741 | } | ||
742 | |||
743 | void debug_cpumask_set_cpu(int cpu, int node, bool enable) | ||
744 | { | ||
745 | struct cpumask *mask; | ||
746 | char buf[64]; | ||
747 | |||
748 | if (node == NUMA_NO_NODE) { | ||
749 | /* early_cpu_to_node() already emits a warning and trace */ | ||
750 | return; | ||
751 | } | ||
752 | mask = node_to_cpumask_map[node]; | ||
753 | if (!mask) { | ||
754 | pr_err("node_to_cpumask_map[%i] NULL\n", node); | ||
755 | dump_stack(); | ||
756 | return; | ||
757 | } | ||
758 | |||
759 | if (enable) | ||
760 | cpumask_set_cpu(cpu, mask); | ||
761 | else | ||
762 | cpumask_clear_cpu(cpu, mask); | ||
763 | |||
764 | cpulist_scnprintf(buf, sizeof(buf), mask); | ||
765 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
766 | enable ? "numa_add_cpu" : "numa_remove_cpu", | ||
767 | cpu, node, buf); | ||
768 | return; | ||
769 | } | ||
770 | |||
771 | # ifndef CONFIG_NUMA_EMU | ||
772 | static void __cpuinit numa_set_cpumask(int cpu, bool enable) | ||
773 | { | ||
774 | debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable); | ||
775 | } | ||
776 | |||
777 | void __cpuinit numa_add_cpu(int cpu) | ||
778 | { | ||
779 | numa_set_cpumask(cpu, true); | ||
780 | } | ||
781 | |||
782 | void __cpuinit numa_remove_cpu(int cpu) | ||
783 | { | ||
784 | numa_set_cpumask(cpu, false); | ||
785 | } | ||
786 | # endif /* !CONFIG_NUMA_EMU */ | ||
787 | |||
39 | /* | 788 | /* |
40 | * Returns a pointer to the bitmask of CPUs on Node 'node'. | 789 | * Returns a pointer to the bitmask of CPUs on Node 'node'. |
41 | */ | 790 | */ |
@@ -58,4 +807,20 @@ const struct cpumask *cpumask_of_node(int node) | |||
58 | return node_to_cpumask_map[node]; | 807 | return node_to_cpumask_map[node]; |
59 | } | 808 | } |
60 | EXPORT_SYMBOL(cpumask_of_node); | 809 | EXPORT_SYMBOL(cpumask_of_node); |
810 | |||
811 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
812 | |||
813 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
814 | int memory_add_physaddr_to_nid(u64 start) | ||
815 | { | ||
816 | struct numa_meminfo *mi = &numa_meminfo; | ||
817 | int nid = mi->blk[0].nid; | ||
818 | int i; | ||
819 | |||
820 | for (i = 0; i < mi->nr_blks; i++) | ||
821 | if (mi->blk[i].start <= start && mi->blk[i].end > start) | ||
822 | nid = mi->blk[i].nid; | ||
823 | return nid; | ||
824 | } | ||
825 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
61 | #endif | 826 | #endif |
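The hotplug helper just added scans numa_meminfo for a block containing the start of the new range and falls back to the first block's node when nothing matches. The same lookup as a standalone sketch (block layout is made up):

#include <stdio.h>
#include <stdint.h>

struct memblk { uint64_t start, end; int nid; };

/* default to the first block's node, then take any block containing start */
static int paddr_to_nid(const struct memblk *blk, int nr, uint64_t start)
{
	int nid = blk[0].nid;

	for (int i = 0; i < nr; i++)
		if (blk[i].start <= start && blk[i].end > start)
			nid = blk[i].nid;
	return nid;
}

int main(void)
{
	struct memblk mi[] = {
		{ 0x00000000, 0x80000000,     0 },
		{ 0x80000000, 0x100000000ULL, 1 },
	};

	printf("%d\n", paddr_to_nid(mi, 2, 0x90000000));	/* node 1 */
	printf("%d\n", paddr_to_nid(mi, 2, 0x200000000ULL));	/* falls back to 0 */
	return 0;
}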
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 809baaaf48b1..849a975d3fa0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c | |||
@@ -22,38 +22,11 @@ | |||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/mm.h> | ||
26 | #include <linux/bootmem.h> | 25 | #include <linux/bootmem.h> |
27 | #include <linux/mmzone.h> | 26 | #include <linux/memblock.h> |
28 | #include <linux/highmem.h> | ||
29 | #include <linux/initrd.h> | ||
30 | #include <linux/nodemask.h> | ||
31 | #include <linux/module.h> | 27 | #include <linux/module.h> |
32 | #include <linux/kexec.h> | ||
33 | #include <linux/pfn.h> | ||
34 | #include <linux/swap.h> | ||
35 | #include <linux/acpi.h> | ||
36 | |||
37 | #include <asm/e820.h> | ||
38 | #include <asm/setup.h> | ||
39 | #include <asm/mmzone.h> | ||
40 | #include <asm/bios_ebda.h> | ||
41 | #include <asm/proto.h> | ||
42 | |||
43 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | ||
44 | EXPORT_SYMBOL(node_data); | ||
45 | |||
46 | /* | ||
47 | * numa interface - we expect the numa architecture specific code to have | ||
48 | * populated the following initialisation. | ||
49 | * | ||
50 | * 1) node_online_map - the map of all nodes configured (online) in the system | ||
51 | * 2) node_start_pfn - the starting page frame number for a node | ||
52 | * 3) node_end_pfn - the ending page fram number for a node | ||
53 | */ | ||
54 | unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; | ||
55 | unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; | ||
56 | 28 | ||
29 | #include "numa_internal.h" | ||
57 | 30 | ||
58 | #ifdef CONFIG_DISCONTIGMEM | 31 | #ifdef CONFIG_DISCONTIGMEM |
59 | /* | 32 | /* |
@@ -98,102 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
98 | } | 71 | } |
99 | #endif | 72 | #endif |
100 | 73 | ||
101 | extern unsigned long find_max_low_pfn(void); | ||
102 | extern unsigned long highend_pfn, highstart_pfn; | 74 | extern unsigned long highend_pfn, highstart_pfn; |
103 | 75 | ||
104 | #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) | 76 | #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) |
105 | 77 | ||
106 | unsigned long node_remap_size[MAX_NUMNODES]; | ||
107 | static void *node_remap_start_vaddr[MAX_NUMNODES]; | 78 | static void *node_remap_start_vaddr[MAX_NUMNODES]; |
108 | void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | 79 | void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); |
109 | 80 | ||
110 | static unsigned long kva_start_pfn; | ||
111 | static unsigned long kva_pages; | ||
112 | /* | ||
113 | * FLAT - support for basic PC memory model with discontig enabled, essentially | ||
114 | * a single node with all available processors in it with a flat | ||
115 | * memory map. | ||
116 | */ | ||
117 | int __init get_memcfg_numa_flat(void) | ||
118 | { | ||
119 | printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); | ||
120 | |||
121 | node_start_pfn[0] = 0; | ||
122 | node_end_pfn[0] = max_pfn; | ||
123 | e820_register_active_regions(0, 0, max_pfn); | ||
124 | memory_present(0, 0, max_pfn); | ||
125 | node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); | ||
126 | |||
127 | /* Indicate there is one node available. */ | ||
128 | nodes_clear(node_online_map); | ||
129 | node_set_online(0); | ||
130 | return 1; | ||
131 | } | ||
132 | |||
133 | /* | ||
134 | * Find the highest page frame number we have available for the node | ||
135 | */ | ||
136 | static void __init propagate_e820_map_node(int nid) | ||
137 | { | ||
138 | if (node_end_pfn[nid] > max_pfn) | ||
139 | node_end_pfn[nid] = max_pfn; | ||
140 | /* | ||
141 | * if a user has given mem=XXXX, then we need to make sure | ||
142 | * that the node _starts_ before that, too, not just ends | ||
143 | */ | ||
144 | if (node_start_pfn[nid] > max_pfn) | ||
145 | node_start_pfn[nid] = max_pfn; | ||
146 | BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); | ||
147 | } | ||
148 | |||
149 | /* | ||
150 | * Allocate memory for the pg_data_t for this node via a crude pre-bootmem | ||
151 | * method. For node zero take this from the bottom of memory, for | ||
152 | * subsequent nodes place them at node_remap_start_vaddr which contains | ||
153 | * node local data in physically node local memory. See setup_memory() | ||
154 | * for details. | ||
155 | */ | ||
156 | static void __init allocate_pgdat(int nid) | ||
157 | { | ||
158 | char buf[16]; | ||
159 | |||
160 | if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) | ||
161 | NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; | ||
162 | else { | ||
163 | unsigned long pgdat_phys; | ||
164 | pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT, | ||
165 | max_pfn_mapped<<PAGE_SHIFT, | ||
166 | sizeof(pg_data_t), | ||
167 | PAGE_SIZE); | ||
168 | NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); | ||
169 | memset(buf, 0, sizeof(buf)); | ||
170 | sprintf(buf, "NODE_DATA %d", nid); | ||
171 | reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); | ||
172 | } | ||
173 | printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", | ||
174 | nid, (unsigned long)NODE_DATA(nid)); | ||
175 | } | ||
176 | |||
177 | /* | 81 | /* |
178 | * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel | 82 | * Remap memory allocator |
179 | * virtual address space (KVA) is reserved and portions of nodes are mapped | ||
180 | * using it. This is to allow node-local memory to be allocated for | ||
181 | * structures that would normally require ZONE_NORMAL. The memory is | ||
182 | * allocated with alloc_remap() and callers should be prepared to allocate | ||
183 | * from the bootmem allocator instead. | ||
184 | */ | 83 | */ |
185 | static unsigned long node_remap_start_pfn[MAX_NUMNODES]; | 84 | static unsigned long node_remap_start_pfn[MAX_NUMNODES]; |
186 | static void *node_remap_end_vaddr[MAX_NUMNODES]; | 85 | static void *node_remap_end_vaddr[MAX_NUMNODES]; |
187 | static void *node_remap_alloc_vaddr[MAX_NUMNODES]; | 86 | static void *node_remap_alloc_vaddr[MAX_NUMNODES]; |
188 | static unsigned long node_remap_offset[MAX_NUMNODES]; | ||
189 | 87 | ||
88 | /** | ||
89 | * alloc_remap - Allocate remapped memory | ||
90 | * @nid: NUMA node to allocate memory from | ||
91 | * @size: The size of allocation | ||
92 | * | ||
93 | * Allocate @size bytes from the remap area of NUMA node @nid. The | ||
94 | * size of the remap area is predetermined by init_alloc_remap() and | ||
95 | * only the callers considered there should call this function. For | ||
96 | * more info, please read the comment on top of init_alloc_remap(). | ||
97 | * | ||
98 | * The caller must be ready to handle allocation failure from this | ||
99 | * function and fall back to regular memory allocator in such cases. | ||
100 | * | ||
101 | * CONTEXT: | ||
102 | * Single CPU early boot context. | ||
103 | * | ||
104 | * RETURNS: | ||
105 | * Pointer to the allocated memory on success, %NULL on failure. | ||
106 | */ | ||
190 | void *alloc_remap(int nid, unsigned long size) | 107 | void *alloc_remap(int nid, unsigned long size) |
191 | { | 108 | { |
192 | void *allocation = node_remap_alloc_vaddr[nid]; | 109 | void *allocation = node_remap_alloc_vaddr[nid]; |
193 | 110 | ||
194 | size = ALIGN(size, L1_CACHE_BYTES); | 111 | size = ALIGN(size, L1_CACHE_BYTES); |
195 | 112 | ||
196 | if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) | 113 | if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) |
197 | return NULL; | 114 | return NULL; |
198 | 115 | ||
199 | node_remap_alloc_vaddr[nid] += size; | 116 | node_remap_alloc_vaddr[nid] += size; |
@@ -202,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size) | |||
202 | return allocation; | 119 | return allocation; |
203 | } | 120 | } |
204 | 121 | ||
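alloc_remap() is a plain bump allocator over the per-node remap window; note the bound check also changes from >= to >, so an allocation that ends exactly at the window end is no longer rejected. A userspace sketch of the same behaviour (window size and alignment are illustrative):

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define L1_CACHE_BYTES	64
#define ALIGN(x, a)	(((x) + (size_t)(a) - 1) & ~((size_t)(a) - 1))

static char window[4096];		/* stand-in for the remapped area */
static char *alloc_ptr = window;
static char *window_end = window + sizeof(window);

static void *remap_alloc(size_t size)
{
	void *p = alloc_ptr;

	size = ALIGN(size, L1_CACHE_BYTES);
	if (alloc_ptr + size > window_end)
		return NULL;		/* caller falls back to bootmem */
	alloc_ptr += size;
	memset(p, 0, size);
	return p;
}

int main(void)
{
	void *a = remap_alloc(100);	/* fits */
	void *b = remap_alloc(4000);	/* would overrun the window -> NULL */
	void *c = remap_alloc(100);	/* still fits after the failed attempt */

	printf("%p %p %p\n", a, b, c);
	return 0;
}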
205 | static void __init remap_numa_kva(void) | ||
206 | { | ||
207 | void *vaddr; | ||
208 | unsigned long pfn; | ||
209 | int node; | ||
210 | |||
211 | for_each_online_node(node) { | ||
212 | printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); | ||
213 | for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { | ||
214 | vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT); | ||
215 | printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n", | ||
216 | (unsigned long)vaddr, | ||
217 | node_remap_start_pfn[node] + pfn); | ||
218 | set_pmd_pfn((ulong) vaddr, | ||
219 | node_remap_start_pfn[node] + pfn, | ||
220 | PAGE_KERNEL_LARGE); | ||
221 | } | ||
222 | } | ||
223 | } | ||
224 | |||
225 | #ifdef CONFIG_HIBERNATION | 122 | #ifdef CONFIG_HIBERNATION |
226 | /** | 123 | /** |
227 | * resume_map_numa_kva - add KVA mapping to the temporary page tables created | 124 | * resume_map_numa_kva - add KVA mapping to the temporary page tables created |
@@ -233,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) | |||
233 | int node; | 130 | int node; |
234 | 131 | ||
235 | for_each_online_node(node) { | 132 | for_each_online_node(node) { |
236 | unsigned long start_va, start_pfn, size, pfn; | 133 | unsigned long start_va, start_pfn, nr_pages, pfn; |
237 | 134 | ||
238 | start_va = (unsigned long)node_remap_start_vaddr[node]; | 135 | start_va = (unsigned long)node_remap_start_vaddr[node]; |
239 | start_pfn = node_remap_start_pfn[node]; | 136 | start_pfn = node_remap_start_pfn[node]; |
240 | size = node_remap_size[node]; | 137 | nr_pages = (node_remap_end_vaddr[node] - |
138 | node_remap_start_vaddr[node]) >> PAGE_SHIFT; | ||
241 | 139 | ||
242 | printk(KERN_DEBUG "%s: node %d\n", __func__, node); | 140 | printk(KERN_DEBUG "%s: node %d\n", __func__, node); |
243 | 141 | ||
244 | for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { | 142 | for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { |
245 | unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); | 143 | unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); |
246 | pgd_t *pgd = pgd_base + pgd_index(vaddr); | 144 | pgd_t *pgd = pgd_base + pgd_index(vaddr); |
247 | pud_t *pud = pud_offset(pgd, vaddr); | 145 | pud_t *pud = pud_offset(pgd, vaddr); |
@@ -257,134 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base) | |||
257 | } | 155 | } |
258 | #endif | 156 | #endif |
259 | 157 | ||
260 | static __init unsigned long calculate_numa_remap_pages(void) | 158 | /** |
261 | { | 159 | * init_alloc_remap - Initialize remap allocator for a NUMA node |
262 | int nid; | 160 | * @nid: NUMA node to initialize remap allocator for
263 | unsigned long size, reserve_pages = 0; | 161 | * |
264 | 162 | * NUMA nodes may end up without any lowmem. As allocating pgdat and | |
265 | for_each_online_node(nid) { | 163 | * memmap on a different node with lowmem is inefficient, a special |
266 | u64 node_kva_target; | 164 | * remap allocator is implemented which can be used by alloc_remap(). |
267 | u64 node_kva_final; | 165 | * |
268 | 166 | * For each node, the amount of memory which will be necessary for | |
269 | /* | 167 | * pgdat and memmap is calculated and two memory areas of the size are |
270 | * The acpi/srat node info can show hot-add memory zones | 168 | * For each node, the amount of memory which will be necessary for
271 | * where memory could be added but not currently present. | 169 | * pgdat and memmap is calculated and two memory areas of that size are
272 | */ | 170 | * |
273 | printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", | 171 | * As pgdat and memmap must be allocated in lowmem anyway, this |
274 | nid, node_start_pfn[nid], node_end_pfn[nid]); | 172 | * doesn't waste lowmem address space; however, the actual lowmem |
275 | if (node_start_pfn[nid] > max_pfn) | 173 | * which gets remapped over is wasted. The amount shouldn't be |
276 | continue; | 174 | * problematic on machines where this feature is used.
277 | if (!node_end_pfn[nid]) | 175 | * |
278 | continue; | 176 | * Initialization failure isn't fatal. alloc_remap() is used |
279 | if (node_end_pfn[nid] > max_pfn) | 177 | * opportunistically and the callers will fall back to other memory |
280 | node_end_pfn[nid] = max_pfn; | 178 | * allocation mechanisms on failure. |
281 | 179 | */ | |
282 | /* ensure the remap includes space for the pgdat. */ | 180 | void __init init_alloc_remap(int nid, u64 start, u64 end) |
283 | size = node_remap_size[nid] + sizeof(pg_data_t); | ||
284 | |||
285 | /* convert size to large (pmd size) pages, rounding up */ | ||
286 | size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; | ||
287 | /* now the roundup is correct, convert to PAGE_SIZE pages */ | ||
288 | size = size * PTRS_PER_PTE; | ||
289 | |||
290 | node_kva_target = round_down(node_end_pfn[nid] - size, | ||
291 | PTRS_PER_PTE); | ||
292 | node_kva_target <<= PAGE_SHIFT; | ||
293 | do { | ||
294 | node_kva_final = find_e820_area(node_kva_target, | ||
295 | ((u64)node_end_pfn[nid])<<PAGE_SHIFT, | ||
296 | ((u64)size)<<PAGE_SHIFT, | ||
297 | LARGE_PAGE_BYTES); | ||
298 | node_kva_target -= LARGE_PAGE_BYTES; | ||
299 | } while (node_kva_final == -1ULL && | ||
300 | (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); | ||
301 | |||
302 | if (node_kva_final == -1ULL) | ||
303 | panic("Can not get kva ram\n"); | ||
304 | |||
305 | node_remap_size[nid] = size; | ||
306 | node_remap_offset[nid] = reserve_pages; | ||
307 | reserve_pages += size; | ||
308 | printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" | ||
309 | " node %d at %llx\n", | ||
310 | size, nid, node_kva_final>>PAGE_SHIFT); | ||
311 | |||
312 | /* | ||
313 | * prevent kva address below max_low_pfn want it on system | ||
314 | * with less memory later. | ||
315 | * layout will be: KVA address , KVA RAM | ||
316 | * | ||
317 | * we are supposed to only record the one less then max_low_pfn | ||
318 | * but we could have some hole in high memory, and it will only | ||
319 | * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide | ||
320 | * to use it as free. | ||
321 | * So reserve_early here, hope we don't run out of that array | ||
322 | */ | ||
323 | reserve_early(node_kva_final, | ||
324 | node_kva_final+(((u64)size)<<PAGE_SHIFT), | ||
325 | "KVA RAM"); | ||
326 | |||
327 | node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; | ||
328 | remove_active_range(nid, node_remap_start_pfn[nid], | ||
329 | node_remap_start_pfn[nid] + size); | ||
330 | } | ||
331 | printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", | ||
332 | reserve_pages); | ||
333 | return reserve_pages; | ||
334 | } | ||
335 | |||
336 | static void init_remap_allocator(int nid) | ||
337 | { | ||
338 | node_remap_start_vaddr[nid] = pfn_to_kaddr( | ||
339 | kva_start_pfn + node_remap_offset[nid]); | ||
340 | node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + | ||
341 | (node_remap_size[nid] * PAGE_SIZE); | ||
342 | node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + | ||
343 | ALIGN(sizeof(pg_data_t), PAGE_SIZE); | ||
344 | |||
345 | printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, | ||
346 | (ulong) node_remap_start_vaddr[nid], | ||
347 | (ulong) node_remap_end_vaddr[nid]); | ||
348 | } | ||
349 | |||
350 | void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | ||
351 | int acpi, int k8) | ||
352 | { | 181 | { |
353 | int nid; | 182 | unsigned long start_pfn = start >> PAGE_SHIFT; |
354 | long kva_target_pfn; | 183 | unsigned long end_pfn = end >> PAGE_SHIFT; |
184 | unsigned long size, pfn; | ||
185 | u64 node_pa, remap_pa; | ||
186 | void *remap_va; | ||
355 | 187 | ||
356 | /* | 188 | /* |
357 | * When mapping a NUMA machine we allocate the node_mem_map arrays | 189 | * The acpi/srat node info can show hot-add memory zones where
358 | * from node local memory. They are then mapped directly into KVA | 190 | * memory could be added but not currently present. |
359 | * between zone normal and vmalloc space. Calculate the size of | ||
360 | * this space and use it to adjust the boundary between ZONE_NORMAL | ||
361 | * and ZONE_HIGHMEM. | ||
362 | */ | 191 | */ |
192 | printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", | ||
193 | nid, start_pfn, end_pfn); | ||
194 | |||
195 | /* calculate the necessary space aligned to large page size */ | ||
196 | size = node_memmap_size_bytes(nid, start_pfn, end_pfn); | ||
197 | size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); | ||
198 | size = ALIGN(size, LARGE_PAGE_BYTES); | ||
199 | |||
200 | /* allocate node memory and the lowmem remap area */ | ||
201 | node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); | ||
202 | if (node_pa == MEMBLOCK_ERROR) { | ||
203 | pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", | ||
204 | size, nid); | ||
205 | return; | ||
206 | } | ||
207 | memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); | ||
208 | |||
209 | remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, | ||
210 | max_low_pfn << PAGE_SHIFT, | ||
211 | size, LARGE_PAGE_BYTES); | ||
212 | if (remap_pa == MEMBLOCK_ERROR) { | ||
213 | pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", | ||
214 | size, nid); | ||
215 | memblock_x86_free_range(node_pa, node_pa + size); | ||
216 | return; | ||
217 | } | ||
218 | memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); | ||
219 | remap_va = phys_to_virt(remap_pa); | ||
220 | |||
221 | /* perform actual remap */ | ||
222 | for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) | ||
223 | set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), | ||
224 | (node_pa >> PAGE_SHIFT) + pfn, | ||
225 | PAGE_KERNEL_LARGE); | ||
226 | |||
227 | /* initialize remap allocator parameters */ | ||
228 | node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; | ||
229 | node_remap_start_vaddr[nid] = remap_va; | ||
230 | node_remap_end_vaddr[nid] = remap_va + size; | ||
231 | node_remap_alloc_vaddr[nid] = remap_va; | ||
232 | |||
233 | printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", | ||
234 | nid, node_pa, node_pa + size, remap_va, remap_va + size); | ||
235 | } | ||
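The window sized above is memmap plus pgdat, rounded up to whole large pages so it can be mapped with set_pmd_pfn(). Back-of-the-envelope arithmetic, assuming purely for illustration a 4 GiB node, a 64-byte struct page, an 8 KiB pg_data_t and 2 MiB large pages:

#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PTRS_PER_PTE	512ULL	/* with PAE; 1024 without, giving 4 MiB large pages */
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long long nr_pages = (4ULL << 30) / PAGE_SIZE;	/* 4 GiB node */
	unsigned long long size = nr_pages * 64;		/* memmap */

	size += ALIGN(8192ULL, PAGE_SIZE);			/* pgdat */
	size = ALIGN(size, LARGE_PAGE_BYTES);			/* whole large pages */
	printf("remap window: %llu MiB\n", size >> 20);		/* 66 MiB */
	return 0;
}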
363 | 236 | ||
364 | get_memcfg_numa(); | 237 | void __init initmem_init(void) |
365 | 238 | { | |
366 | kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); | 239 | x86_numa_init(); |
367 | |||
368 | kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); | ||
369 | do { | ||
370 | kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT, | ||
371 | max_low_pfn<<PAGE_SHIFT, | ||
372 | kva_pages<<PAGE_SHIFT, | ||
373 | PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; | ||
374 | kva_target_pfn -= PTRS_PER_PTE; | ||
375 | } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); | ||
376 | |||
377 | if (kva_start_pfn == -1UL) | ||
378 | panic("Can not get kva space\n"); | ||
379 | |||
380 | printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", | ||
381 | kva_start_pfn, max_low_pfn); | ||
382 | printk(KERN_INFO "max_pfn = %lx\n", max_pfn); | ||
383 | 240 | ||
384 | /* avoid clash with initrd */ | ||
385 | reserve_early(kva_start_pfn<<PAGE_SHIFT, | ||
386 | (kva_start_pfn + kva_pages)<<PAGE_SHIFT, | ||
387 | "KVA PG"); | ||
388 | #ifdef CONFIG_HIGHMEM | 241 | #ifdef CONFIG_HIGHMEM |
389 | highstart_pfn = highend_pfn = max_pfn; | 242 | highstart_pfn = highend_pfn = max_pfn; |
390 | if (max_pfn > max_low_pfn) | 243 | if (max_pfn > max_low_pfn) |
@@ -404,54 +257,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, | |||
404 | 257 | ||
405 | printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", | 258 | printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", |
406 | (ulong) pfn_to_kaddr(max_low_pfn)); | 259 | (ulong) pfn_to_kaddr(max_low_pfn)); |
407 | for_each_online_node(nid) { | ||
408 | init_remap_allocator(nid); | ||
409 | |||
410 | allocate_pgdat(nid); | ||
411 | } | ||
412 | remap_numa_kva(); | ||
413 | 260 | ||
414 | printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", | 261 | printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", |
415 | (ulong) pfn_to_kaddr(highstart_pfn)); | 262 | (ulong) pfn_to_kaddr(highstart_pfn)); |
416 | for_each_online_node(nid) | ||
417 | propagate_e820_map_node(nid); | ||
418 | |||
419 | for_each_online_node(nid) { | ||
420 | memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); | ||
421 | NODE_DATA(nid)->node_id = nid; | ||
422 | #ifndef CONFIG_NO_BOOTMEM | ||
423 | NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; | ||
424 | #endif | ||
425 | } | ||
426 | 263 | ||
427 | setup_bootmem_allocator(); | 264 | setup_bootmem_allocator(); |
428 | } | 265 | } |
429 | |||
430 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
431 | static int paddr_to_nid(u64 addr) | ||
432 | { | ||
433 | int nid; | ||
434 | unsigned long pfn = PFN_DOWN(addr); | ||
435 | |||
436 | for_each_node(nid) | ||
437 | if (node_start_pfn[nid] <= pfn && | ||
438 | pfn < node_end_pfn[nid]) | ||
439 | return nid; | ||
440 | |||
441 | return -1; | ||
442 | } | ||
443 | |||
444 | /* | ||
445 | * This function is used to ask node id BEFORE memmap and mem_section's | ||
446 | * initialization (pfn_to_nid() can't be used yet). | ||
447 | * If _PXM is not defined on ACPI's DSDT, node id must be found by this. | ||
448 | */ | ||
449 | int memory_add_physaddr_to_nid(u64 addr) | ||
450 | { | ||
451 | int nid = paddr_to_nid(addr); | ||
452 | return (nid >= 0) ? nid : 0; | ||
453 | } | ||
454 | |||
455 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
456 | #endif | ||
457 | |||
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a7bcc23ef96c..dd27f401f0a0 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c | |||
@@ -2,697 +2,13 @@ | |||
2 | * Generic VM initialization for x86-64 NUMA setups. | 2 | * Generic VM initialization for x86-64 NUMA setups. |
3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. | 3 | * Copyright 2002,2003 Andi Kleen, SuSE Labs. |
4 | */ | 4 | */ |
5 | #include <linux/kernel.h> | ||
6 | #include <linux/mm.h> | ||
7 | #include <linux/string.h> | ||
8 | #include <linux/init.h> | ||
9 | #include <linux/bootmem.h> | 5 | #include <linux/bootmem.h> |
10 | #include <linux/mmzone.h> | ||
11 | #include <linux/ctype.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/nodemask.h> | ||
14 | #include <linux/sched.h> | ||
15 | 6 | ||
16 | #include <asm/e820.h> | 7 | #include "numa_internal.h" |
17 | #include <asm/proto.h> | ||
18 | #include <asm/dma.h> | ||
19 | #include <asm/numa.h> | ||
20 | #include <asm/acpi.h> | ||
21 | #include <asm/k8.h> | ||
22 | 8 | ||
23 | struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; | 9 | void __init initmem_init(void) |
24 | EXPORT_SYMBOL(node_data); | ||
25 | |||
26 | struct memnode memnode; | ||
27 | |||
28 | s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { | ||
29 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
30 | }; | ||
31 | |||
32 | int numa_off __initdata; | ||
33 | static unsigned long __initdata nodemap_addr; | ||
34 | static unsigned long __initdata nodemap_size; | ||
35 | |||
36 | /* | ||
37 | * Map cpu index to node index | ||
38 | */ | ||
39 | DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE); | ||
40 | EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map); | ||
41 | |||
42 | /* | ||
43 | * Given a shift value, try to populate memnodemap[] | ||
44 | * Returns : | ||
45 | * 1 if OK | ||
46 | * 0 if memnodmap[] too small (of shift too small) | ||
47 | * -1 if node overlap or lost ram (shift too big) | ||
48 | */ | ||
49 | static int __init populate_memnodemap(const struct bootnode *nodes, | ||
50 | int numnodes, int shift, int *nodeids) | ||
51 | { | ||
52 | unsigned long addr, end; | ||
53 | int i, res = -1; | ||
54 | |||
55 | memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); | ||
56 | for (i = 0; i < numnodes; i++) { | ||
57 | addr = nodes[i].start; | ||
58 | end = nodes[i].end; | ||
59 | if (addr >= end) | ||
60 | continue; | ||
61 | if ((end >> shift) >= memnodemapsize) | ||
62 | return 0; | ||
63 | do { | ||
64 | if (memnodemap[addr >> shift] != NUMA_NO_NODE) | ||
65 | return -1; | ||
66 | |||
67 | if (!nodeids) | ||
68 | memnodemap[addr >> shift] = i; | ||
69 | else | ||
70 | memnodemap[addr >> shift] = nodeids[i]; | ||
71 | |||
72 | addr += (1UL << shift); | ||
73 | } while (addr < end); | ||
74 | res = 1; | ||
75 | } | ||
76 | return res; | ||
77 | } | ||
78 | |||
79 | static int __init allocate_cachealigned_memnodemap(void) | ||
80 | { | ||
81 | unsigned long addr; | ||
82 | |||
83 | memnodemap = memnode.embedded_map; | ||
84 | if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) | ||
85 | return 0; | ||
86 | |||
87 | addr = 0x8000; | ||
88 | nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); | ||
89 | nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, | ||
90 | nodemap_size, L1_CACHE_BYTES); | ||
91 | if (nodemap_addr == -1UL) { | ||
92 | printk(KERN_ERR | ||
93 | "NUMA: Unable to allocate Memory to Node hash map\n"); | ||
94 | nodemap_addr = nodemap_size = 0; | ||
95 | return -1; | ||
96 | } | ||
97 | memnodemap = phys_to_virt(nodemap_addr); | ||
98 | reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP"); | ||
99 | |||
100 | printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", | ||
101 | nodemap_addr, nodemap_addr + nodemap_size); | ||
102 | return 0; | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * The LSB of all start and end addresses in the node map is the value of the | ||
107 | * maximum possible shift. | ||
108 | */ | ||
109 | static int __init extract_lsb_from_nodes(const struct bootnode *nodes, | ||
110 | int numnodes) | ||
111 | { | ||
112 | int i, nodes_used = 0; | ||
113 | unsigned long start, end; | ||
114 | unsigned long bitfield = 0, memtop = 0; | ||
115 | |||
116 | for (i = 0; i < numnodes; i++) { | ||
117 | start = nodes[i].start; | ||
118 | end = nodes[i].end; | ||
119 | if (start >= end) | ||
120 | continue; | ||
121 | bitfield |= start; | ||
122 | nodes_used++; | ||
123 | if (end > memtop) | ||
124 | memtop = end; | ||
125 | } | ||
126 | if (nodes_used <= 1) | ||
127 | i = 63; | ||
128 | else | ||
129 | i = find_first_bit(&bitfield, sizeof(unsigned long)*8); | ||
130 | memnodemapsize = (memtop >> i)+1; | ||
131 | return i; | ||
132 | } | ||
133 | |||
134 | int __init compute_hash_shift(struct bootnode *nodes, int numnodes, | ||
135 | int *nodeids) | ||
136 | { | ||
137 | int shift; | ||
138 | |||
139 | shift = extract_lsb_from_nodes(nodes, numnodes); | ||
140 | if (allocate_cachealigned_memnodemap()) | ||
141 | return -1; | ||
142 | printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", | ||
143 | shift); | ||
144 | |||
145 | if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) { | ||
146 | printk(KERN_INFO "Your memory is not aligned you need to " | ||
147 | "rebuild your kernel with a bigger NODEMAPSIZE " | ||
148 | "shift=%d\n", shift); | ||
149 | return -1; | ||
150 | } | ||
151 | return shift; | ||
152 | } | ||
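For context on the mechanism being removed here: the old memnodemap resolved a physical address to a node with a single shift chosen from the lowest set bit of all node boundaries, i.e. phys_to_nid() was memnodemap[addr >> shift]. A toy version with two 1 GiB nodes (shift and layout are illustrative):

#include <stdio.h>
#include <stdint.h>

#define SHIFT	30	/* 1 GiB granularity: common low bit of all boundaries */

static signed char memnodemap[4] = { 0, 1, -1, -1 };	/* -1 == no node */

static int phys_to_nid(uint64_t addr)
{
	return memnodemap[addr >> SHIFT];
}

int main(void)
{
	printf("%d %d\n", phys_to_nid(0x10000000ULL),
	       phys_to_nid(0x50000000ULL));	/* 0 1 */
	return 0;
}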
153 | |||
154 | int __meminit __early_pfn_to_nid(unsigned long pfn) | ||
155 | { | ||
156 | return phys_to_nid(pfn << PAGE_SHIFT); | ||
157 | } | ||
158 | |||
159 | static void * __init early_node_mem(int nodeid, unsigned long start, | ||
160 | unsigned long end, unsigned long size, | ||
161 | unsigned long align) | ||
162 | { | ||
163 | unsigned long mem; | ||
164 | |||
165 | /* | ||
166 | * put it on high as possible | ||
167 | * something will go with NODE_DATA | ||
168 | */ | ||
169 | if (start < (MAX_DMA_PFN<<PAGE_SHIFT)) | ||
170 | start = MAX_DMA_PFN<<PAGE_SHIFT; | ||
171 | if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) && | ||
172 | end > (MAX_DMA32_PFN<<PAGE_SHIFT)) | ||
173 | start = MAX_DMA32_PFN<<PAGE_SHIFT; | ||
174 | mem = find_e820_area(start, end, size, align); | ||
175 | if (mem != -1L) | ||
176 | return __va(mem); | ||
177 | |||
178 | /* extend the search scope */ | ||
179 | end = max_pfn_mapped << PAGE_SHIFT; | ||
180 | if (end > (MAX_DMA32_PFN<<PAGE_SHIFT)) | ||
181 | start = MAX_DMA32_PFN<<PAGE_SHIFT; | ||
182 | else | ||
183 | start = MAX_DMA_PFN<<PAGE_SHIFT; | ||
184 | mem = find_e820_area(start, end, size, align); | ||
185 | if (mem != -1L) | ||
186 | return __va(mem); | ||
187 | |||
188 | printk(KERN_ERR "Cannot find %lu bytes in node %d\n", | ||
189 | size, nodeid); | ||
190 | |||
191 | return NULL; | ||
192 | } | ||
193 | |||
194 | /* Initialize bootmem allocator for a node */ | ||
195 | void __init | ||
196 | setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) | ||
197 | { | ||
198 | unsigned long start_pfn, last_pfn, nodedata_phys; | ||
199 | const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); | ||
200 | int nid; | ||
201 | #ifndef CONFIG_NO_BOOTMEM | ||
202 | unsigned long bootmap_start, bootmap_pages, bootmap_size; | ||
203 | void *bootmap; | ||
204 | #endif | ||
205 | |||
206 | if (!end) | ||
207 | return; | ||
208 | |||
209 | /* | ||
210 | * Don't confuse VM with a node that doesn't have the | ||
211 | * minimum amount of memory: | ||
212 | */ | ||
213 | if (end && (end - start) < NODE_MIN_SIZE) | ||
214 | return; | ||
215 | |||
216 | start = roundup(start, ZONE_ALIGN); | ||
217 | |||
218 | printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, | ||
219 | start, end); | ||
220 | |||
221 | start_pfn = start >> PAGE_SHIFT; | ||
222 | last_pfn = end >> PAGE_SHIFT; | ||
223 | |||
224 | node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, | ||
225 | SMP_CACHE_BYTES); | ||
226 | if (node_data[nodeid] == NULL) | ||
227 | return; | ||
228 | nodedata_phys = __pa(node_data[nodeid]); | ||
229 | reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); | ||
230 | printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, | ||
231 | nodedata_phys + pgdat_size - 1); | ||
232 | nid = phys_to_nid(nodedata_phys); | ||
233 | if (nid != nodeid) | ||
234 | printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); | ||
235 | |||
236 | memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); | ||
237 | NODE_DATA(nodeid)->node_id = nodeid; | ||
238 | NODE_DATA(nodeid)->node_start_pfn = start_pfn; | ||
239 | NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; | ||
240 | |||
241 | #ifndef CONFIG_NO_BOOTMEM | ||
242 | NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; | ||
243 | |||
244 | /* | ||
245 | * Find a place for the bootmem map | ||
246 | * nodedata_phys could be on other nodes by alloc_bootmem, | ||
247 | * so need to sure bootmap_start not to be small, otherwise | ||
248 | * early_node_mem will get that with find_e820_area instead | ||
249 | * of alloc_bootmem, that could clash with reserved range | ||
250 | */ | ||
251 | bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); | ||
252 | bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); | ||
253 | /* | ||
254 | * SMP_CACHE_BYTES could be enough, but init_bootmem_node like | ||
255 | * to use that to align to PAGE_SIZE | ||
256 | */ | ||
257 | bootmap = early_node_mem(nodeid, bootmap_start, end, | ||
258 | bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); | ||
259 | if (bootmap == NULL) { | ||
260 | free_early(nodedata_phys, nodedata_phys + pgdat_size); | ||
261 | node_data[nodeid] = NULL; | ||
262 | return; | ||
263 | } | ||
264 | bootmap_start = __pa(bootmap); | ||
265 | reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT), | ||
266 | "BOOTMAP"); | ||
267 | |||
268 | bootmap_size = init_bootmem_node(NODE_DATA(nodeid), | ||
269 | bootmap_start >> PAGE_SHIFT, | ||
270 | start_pfn, last_pfn); | ||
271 | |||
272 | printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", | ||
273 | bootmap_start, bootmap_start + bootmap_size - 1, | ||
274 | bootmap_pages); | ||
275 | nid = phys_to_nid(bootmap_start); | ||
276 | if (nid != nodeid) | ||
277 | printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); | ||
278 | |||
279 | free_bootmem_with_active_regions(nodeid, end); | ||
280 | #endif | ||
281 | |||
282 | node_set_online(nodeid); | ||
283 | } | ||
284 | |||
285 | /* | ||
286 | * There are unfortunately some poorly designed mainboards around that | ||
287 | * only connect memory to a single CPU. This breaks the 1:1 cpu->node | ||
288 | * mapping. To avoid this fill in the mapping for all possible CPUs, | ||
289 | * as the number of CPUs is not known yet. We round robin the existing | ||
290 | * nodes. | ||
291 | */ | ||
292 | void __init numa_init_array(void) | ||
293 | { | ||
294 | int rr, i; | ||
295 | |||
296 | rr = first_node(node_online_map); | ||
297 | for (i = 0; i < nr_cpu_ids; i++) { | ||
298 | if (early_cpu_to_node(i) != NUMA_NO_NODE) | ||
299 | continue; | ||
300 | numa_set_node(i, rr); | ||
301 | rr = next_node(rr, node_online_map); | ||
302 | if (rr == MAX_NUMNODES) | ||
303 | rr = first_node(node_online_map); | ||
304 | } | ||
305 | } | ||
306 | |||
307 | #ifdef CONFIG_NUMA_EMU | ||
308 | /* Numa emulation */ | ||
309 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
310 | static struct bootnode physnodes[MAX_NUMNODES] __initdata; | ||
311 | static char *cmdline __initdata; | ||
312 | |||
313 | static int __init setup_physnodes(unsigned long start, unsigned long end, | ||
314 | int acpi, int k8) | ||
315 | { | ||
316 | int nr_nodes = 0; | ||
317 | int ret = 0; | ||
318 | int i; | ||
319 | |||
320 | #ifdef CONFIG_ACPI_NUMA | ||
321 | if (acpi) | ||
322 | nr_nodes = acpi_get_nodes(physnodes); | ||
323 | #endif | ||
324 | #ifdef CONFIG_K8_NUMA | ||
325 | if (k8) | ||
326 | nr_nodes = k8_get_nodes(physnodes); | ||
327 | #endif | ||
328 | /* | ||
329 | * Basic sanity checking on the physical node map: there may be errors | ||
330 | * if the SRAT or K8 incorrectly reported the topology or the mem= | ||
331 | * kernel parameter is used. | ||
332 | */ | ||
333 | for (i = 0; i < nr_nodes; i++) { | ||
334 | if (physnodes[i].start == physnodes[i].end) | ||
335 | continue; | ||
336 | if (physnodes[i].start > end) { | ||
337 | physnodes[i].end = physnodes[i].start; | ||
338 | continue; | ||
339 | } | ||
340 | if (physnodes[i].end < start) { | ||
341 | physnodes[i].start = physnodes[i].end; | ||
342 | continue; | ||
343 | } | ||
344 | if (physnodes[i].start < start) | ||
345 | physnodes[i].start = start; | ||
346 | if (physnodes[i].end > end) | ||
347 | physnodes[i].end = end; | ||
348 | } | ||
349 | |||
350 | /* | ||
351 | * Remove all nodes that have no memory or were truncated because of the | ||
352 | * limited address range. | ||
353 | */ | ||
354 | for (i = 0; i < nr_nodes; i++) { | ||
355 | if (physnodes[i].start == physnodes[i].end) | ||
356 | continue; | ||
357 | physnodes[ret].start = physnodes[i].start; | ||
358 | physnodes[ret].end = physnodes[i].end; | ||
359 | ret++; | ||
360 | } | ||
361 | |||
362 | /* | ||
363 | * If no physical topology was detected, a single node is faked to cover | ||
364 | * the entire address space. | ||
365 | */ | ||
366 | if (!ret) { | ||
367 | physnodes[ret].start = start; | ||
368 | physnodes[ret].end = end; | ||
369 | ret = 1; | ||
370 | } | ||
371 | return ret; | ||
372 | } | ||
373 | |||
374 | /* | ||
375 | * Setups up nid to range from addr to addr + size. If the end | ||
376 | * boundary is greater than max_addr, then max_addr is used instead. | ||
377 | * The return value is 0 if there is additional memory left for | ||
378 | * allocation past addr and -1 otherwise. addr is adjusted to be at | ||
379 | * the end of the node. | ||
380 | */ | ||
381 | static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) | ||
382 | { | ||
383 | int ret = 0; | ||
384 | nodes[nid].start = *addr; | ||
385 | *addr += size; | ||
386 | if (*addr >= max_addr) { | ||
387 | *addr = max_addr; | ||
388 | ret = -1; | ||
389 | } | ||
390 | nodes[nid].end = *addr; | ||
391 | node_set(nid, node_possible_map); | ||
392 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
393 | nodes[nid].start, nodes[nid].end, | ||
394 | (nodes[nid].end - nodes[nid].start) >> 20); | ||
395 | return ret; | ||
396 | } | ||
397 | |||
398 | /* | ||
399 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | ||
400 | * to max_addr. The return value is the number of nodes allocated. | ||
401 | */ | ||
402 | static int __init split_nodes_interleave(u64 addr, u64 max_addr, | ||
403 | int nr_phys_nodes, int nr_nodes) | ||
404 | { | 10 | { |
405 | nodemask_t physnode_mask = NODE_MASK_NONE; | 11 | x86_numa_init(); |
406 | u64 size; | ||
407 | int big; | ||
408 | int ret = 0; | ||
409 | int i; | ||
410 | |||
411 | if (nr_nodes <= 0) | ||
412 | return -1; | ||
413 | if (nr_nodes > MAX_NUMNODES) { | ||
414 | pr_info("numa=fake=%d too large, reducing to %d\n", | ||
415 | nr_nodes, MAX_NUMNODES); | ||
416 | nr_nodes = MAX_NUMNODES; | ||
417 | } | ||
418 | |||
419 | size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes; | ||
420 | /* | ||
421 | * Calculate the number of big nodes that can be allocated as a result | ||
422 | * of consolidating the remainder. | ||
423 | */ | ||
424 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | ||
425 | FAKE_NODE_MIN_SIZE; | ||
426 | |||
427 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
428 | if (!size) { | ||
429 | pr_err("Not enough memory for each node. " | ||
430 | "NUMA emulation disabled.\n"); | ||
431 | return -1; | ||
432 | } | ||
433 | |||
434 | for (i = 0; i < nr_phys_nodes; i++) | ||
435 | if (physnodes[i].start != physnodes[i].end) | ||
436 | node_set(i, physnode_mask); | ||
437 | |||
438 | /* | ||
439 | * Continue to fill physical nodes with fake nodes until there is no | ||
440 | * memory left on any of them. | ||
441 | */ | ||
442 | while (nodes_weight(physnode_mask)) { | ||
443 | for_each_node_mask(i, physnode_mask) { | ||
444 | u64 end = physnodes[i].start + size; | ||
445 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
446 | |||
447 | if (ret < big) | ||
448 | end += FAKE_NODE_MIN_SIZE; | ||
449 | |||
450 | /* | ||
451 | * Continue to add memory to this fake node if its | ||
452 | * non-reserved memory is less than the per-node size. | ||
453 | */ | ||
454 | while (end - physnodes[i].start - | ||
455 | e820_hole_size(physnodes[i].start, end) < size) { | ||
456 | end += FAKE_NODE_MIN_SIZE; | ||
457 | if (end > physnodes[i].end) { | ||
458 | end = physnodes[i].end; | ||
459 | break; | ||
460 | } | ||
461 | } | ||
462 | |||
463 | /* | ||
464 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
465 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
466 | * this one must extend to the boundary. | ||
467 | */ | ||
468 | if (end < dma32_end && dma32_end - end - | ||
469 | e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
470 | end = dma32_end; | ||
471 | |||
472 | /* | ||
473 | * If there won't be enough non-reserved memory for the | ||
474 | * next node, this one must extend to the end of the | ||
475 | * physical node. | ||
476 | */ | ||
477 | if (physnodes[i].end - end - | ||
478 | e820_hole_size(end, physnodes[i].end) < size) | ||
479 | end = physnodes[i].end; | ||
480 | |||
481 | /* | ||
482 | * Avoid allocating more nodes than requested, which can | ||
483 | * happen as a result of rounding down each node's size | ||
484 | * to FAKE_NODE_MIN_SIZE. | ||
485 | */ | ||
486 | if (nodes_weight(physnode_mask) + ret >= nr_nodes) | ||
487 | end = physnodes[i].end; | ||
488 | |||
489 | if (setup_node_range(ret++, &physnodes[i].start, | ||
490 | end - physnodes[i].start, | ||
491 | physnodes[i].end) < 0) | ||
492 | node_clear(i, physnode_mask); | ||
493 | } | ||
494 | } | ||
495 | return ret; | ||
496 | } | ||
497 | |||
498 | /* | ||
499 | * Returns the end address of a node so that there is at least `size' amount of | ||
500 | * non-reserved memory or `max_addr' is reached. | ||
501 | */ | ||
502 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
503 | { | ||
504 | u64 end = start + size; | ||
505 | |||
506 | while (end - start - e820_hole_size(start, end) < size) { | ||
507 | end += FAKE_NODE_MIN_SIZE; | ||
508 | if (end > max_addr) { | ||
509 | end = max_addr; | ||
510 | break; | ||
511 | } | ||
512 | } | ||
513 | return end; | ||
514 | } | ||
515 | |||
516 | /* | ||
517 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | ||
518 | * `addr' to `max_addr'. The return value is the number of nodes allocated. | ||
519 | */ | ||
520 | static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) | ||
521 | { | ||
522 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
523 | u64 min_size; | ||
524 | int ret = 0; | ||
525 | int i; | ||
526 | |||
527 | if (!size) | ||
528 | return -1; | ||
529 | /* | ||
530 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | ||
531 | * increased accordingly if the requested size is too small. This | ||
532 | * creates a uniform distribution of node sizes across the entire | ||
533 | * machine (but not necessarily over physical nodes). | ||
534 | */ | ||
535 | min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / | ||
536 | MAX_NUMNODES; | ||
537 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | ||
538 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | ||
539 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
540 | FAKE_NODE_MIN_HASH_MASK; | ||
541 | if (size < min_size) { | ||
542 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
543 | size >> 20, min_size >> 20); | ||
544 | size = min_size; | ||
545 | } | ||
546 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
547 | |||
548 | for (i = 0; i < MAX_NUMNODES; i++) | ||
549 | if (physnodes[i].start != physnodes[i].end) | ||
550 | node_set(i, physnode_mask); | ||
551 | /* | ||
552 | * Fill physical nodes with fake nodes of size until there is no memory | ||
553 | * left on any of them. | ||
554 | */ | ||
555 | while (nodes_weight(physnode_mask)) { | ||
556 | for_each_node_mask(i, physnode_mask) { | ||
557 | u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; | ||
558 | u64 end; | ||
559 | |||
560 | end = find_end_of_node(physnodes[i].start, | ||
561 | physnodes[i].end, size); | ||
562 | /* | ||
563 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
564 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
565 | * this one must extend to the boundary. | ||
566 | */ | ||
567 | if (end < dma32_end && dma32_end - end - | ||
568 | e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
569 | end = dma32_end; | ||
570 | |||
571 | /* | ||
572 | * If there won't be enough non-reserved memory for the | ||
573 | * next node, this one must extend to the end of the | ||
574 | * physical node. | ||
575 | */ | ||
576 | if (physnodes[i].end - end - | ||
577 | e820_hole_size(end, physnodes[i].end) < size) | ||
578 | end = physnodes[i].end; | ||
579 | |||
580 | /* | ||
581 | * Setup the fake node that will be allocated as bootmem | ||
582 | * later. If setup_node_range() returns non-zero, there | ||
583 | * is no more memory available on this physical node. | ||
584 | */ | ||
585 | if (setup_node_range(ret++, &physnodes[i].start, | ||
586 | end - physnodes[i].start, | ||
587 | physnodes[i].end) < 0) | ||
588 | node_clear(i, physnode_mask); | ||
589 | } | ||
590 | } | ||
591 | return ret; | ||
592 | } | ||
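
The min_size clamping at the top of split_nodes_size_interleave() is clearest with numbers. Assuming, purely for illustration, a 32 MB FAKE_NODE_MIN_SIZE, a 64-node limit and about 4000 MB of usable memory: the per-node floor is 4000/64 = 62.5 MB, which is rounded up to the next 32 MB boundary (64 MB), so a request such as numa=fake=48M is silently bumped to 64 MB nodes. The sketch below reruns that arithmetic in userspace; the constants are assumptions, not the kernel's.

    #include <stdint.h>
    #include <stdio.h>

    /* illustrative values; the kernel's limits may differ */
    #define FAKE_NODE_MIN_SIZE      (32ULL << 20)
    #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1))
    #define MAX_FAKE_NODES          64

    /* mirrors the clamping done by split_nodes_size_interleave() */
    static uint64_t clamp_fake_node_size(uint64_t usable, uint64_t requested)
    {
        uint64_t min_size = usable / MAX_FAKE_NODES;

        if (min_size < FAKE_NODE_MIN_SIZE)
            min_size = FAKE_NODE_MIN_SIZE;
        if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)    /* unaligned */
            min_size = (min_size + FAKE_NODE_MIN_SIZE) & FAKE_NODE_MIN_HASH_MASK;
        if (requested < min_size) {
            printf("requested %lluMB too small, using %lluMB\n",
                   (unsigned long long)(requested >> 20),
                   (unsigned long long)(min_size >> 20));
            requested = min_size;
        }
        return requested & FAKE_NODE_MIN_HASH_MASK;
    }

    int main(void)
    {
        uint64_t size = clamp_fake_node_size(4000ULL << 20, 48ULL << 20);

        printf("final fake node size: %lluMB\n", (unsigned long long)(size >> 20));
        return 0;
    }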
593 | |||
594 | /* | ||
595 | * Sets up the system RAM area from start_pfn to last_pfn according to the | ||
596 | * numa=fake command-line option. | ||
597 | */ | ||
598 | static int __init numa_emulation(unsigned long start_pfn, | ||
599 | unsigned long last_pfn, int acpi, int k8) | ||
600 | { | ||
601 | u64 addr = start_pfn << PAGE_SHIFT; | ||
602 | u64 max_addr = last_pfn << PAGE_SHIFT; | ||
603 | int num_phys_nodes; | ||
604 | int num_nodes; | ||
605 | int i; | ||
606 | |||
607 | num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); | ||
608 | /* | ||
609 | * If the numa=fake command-line contains a 'M' or 'G', it represents | ||
610 | * the fixed node size. Otherwise, if it is just a single number N, | ||
611 | * split the system RAM into N fake nodes. | ||
612 | */ | ||
613 | if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { | ||
614 | u64 size; | ||
615 | |||
616 | size = memparse(cmdline, &cmdline); | ||
617 | num_nodes = split_nodes_size_interleave(addr, max_addr, size); | ||
618 | } else { | ||
619 | unsigned long n; | ||
620 | |||
621 | n = simple_strtoul(cmdline, NULL, 0); | ||
622 | num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); | ||
623 | } | ||
624 | |||
625 | if (num_nodes < 0) | ||
626 | return num_nodes; | ||
627 | memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); | ||
628 | if (memnode_shift < 0) { | ||
629 | memnode_shift = 0; | ||
630 | printk(KERN_ERR "No NUMA hash function found. NUMA emulation " | ||
631 | "disabled.\n"); | ||
632 | return -1; | ||
633 | } | ||
634 | |||
635 | /* | ||
636 | * We need to vacate all active ranges that may have been registered for | ||
637 | * the e820 memory map. | ||
638 | */ | ||
639 | remove_all_active_ranges(); | ||
640 | for_each_node_mask(i, node_possible_map) { | ||
641 | e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, | ||
642 | nodes[i].end >> PAGE_SHIFT); | ||
643 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
644 | } | ||
645 | acpi_fake_nodes(nodes, num_nodes); | ||
646 | numa_init_array(); | ||
647 | return 0; | ||
648 | } | ||
649 | #endif /* CONFIG_NUMA_EMU */ | ||
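
The two numa=fake= forms handled above are, for example, numa=fake=8 (split the usable RAM into eight equally sized nodes) and numa=fake=512M (create as many 512 MB nodes as fit). A small userspace stand-in for the branch on an 'M'/'G' suffix is sketched below; parse_size() only imitates memparse() for those two suffixes and is not the kernel helper.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* crude stand-in for memparse(): handles only the M and G suffixes */
    static unsigned long long parse_size(const char *s)
    {
        char *end;
        unsigned long long v = strtoull(s, &end, 0);

        if (*end == 'G')
            v <<= 30;
        else if (*end == 'M')
            v <<= 20;
        return v;
    }

    static void parse_numa_fake(const char *cmdline)
    {
        if (strchr(cmdline, 'M') || strchr(cmdline, 'G'))
            printf("fixed node size: %llu MB\n", parse_size(cmdline) >> 20);
        else
            printf("number of nodes: %lu\n", strtoul(cmdline, NULL, 0));
    }

    int main(void)
    {
        parse_numa_fake("8");       /* -> number of nodes: 8 */
        parse_numa_fake("512M");    /* -> fixed node size: 512 MB */
        return 0;
    }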
650 | |||
651 | void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, | ||
652 | int acpi, int k8) | ||
653 | { | ||
654 | int i; | ||
655 | |||
656 | nodes_clear(node_possible_map); | ||
657 | nodes_clear(node_online_map); | ||
658 | |||
659 | #ifdef CONFIG_NUMA_EMU | ||
660 | if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) | ||
661 | return; | ||
662 | nodes_clear(node_possible_map); | ||
663 | nodes_clear(node_online_map); | ||
664 | #endif | ||
665 | |||
666 | #ifdef CONFIG_ACPI_NUMA | ||
667 | if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, | ||
668 | last_pfn << PAGE_SHIFT)) | ||
669 | return; | ||
670 | nodes_clear(node_possible_map); | ||
671 | nodes_clear(node_online_map); | ||
672 | #endif | ||
673 | |||
674 | #ifdef CONFIG_K8_NUMA | ||
675 | if (!numa_off && k8 && !k8_scan_nodes()) | ||
676 | return; | ||
677 | nodes_clear(node_possible_map); | ||
678 | nodes_clear(node_online_map); | ||
679 | #endif | ||
680 | printk(KERN_INFO "%s\n", | ||
681 | numa_off ? "NUMA turned off" : "No NUMA configuration found"); | ||
682 | |||
683 | printk(KERN_INFO "Faking a node at %016lx-%016lx\n", | ||
684 | start_pfn << PAGE_SHIFT, | ||
685 | last_pfn << PAGE_SHIFT); | ||
686 | /* setup dummy node covering all memory */ | ||
687 | memnode_shift = 63; | ||
688 | memnodemap = memnode.embedded_map; | ||
689 | memnodemap[0] = 0; | ||
690 | node_set_online(0); | ||
691 | node_set(0, node_possible_map); | ||
692 | for (i = 0; i < nr_cpu_ids; i++) | ||
693 | numa_set_node(i, 0); | ||
694 | e820_register_active_regions(0, start_pfn, last_pfn); | ||
695 | setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT); | ||
696 | } | 12 | } |
697 | 13 | ||
698 | unsigned long __init numa_free_all_bootmem(void) | 14 | unsigned long __init numa_free_all_bootmem(void) |
@@ -703,199 +19,7 @@ unsigned long __init numa_free_all_bootmem(void) | |||
703 | for_each_online_node(i) | 19 | for_each_online_node(i) |
704 | pages += free_all_bootmem_node(NODE_DATA(i)); | 20 | pages += free_all_bootmem_node(NODE_DATA(i)); |
705 | 21 | ||
706 | #ifdef CONFIG_NO_BOOTMEM | ||
707 | pages += free_all_memory_core_early(MAX_NUMNODES); | 22 | pages += free_all_memory_core_early(MAX_NUMNODES); |
708 | #endif | ||
709 | 23 | ||
710 | return pages; | 24 | return pages; |
711 | } | 25 | } |
712 | |||
713 | static __init int numa_setup(char *opt) | ||
714 | { | ||
715 | if (!opt) | ||
716 | return -EINVAL; | ||
717 | if (!strncmp(opt, "off", 3)) | ||
718 | numa_off = 1; | ||
719 | #ifdef CONFIG_NUMA_EMU | ||
720 | if (!strncmp(opt, "fake=", 5)) | ||
721 | cmdline = opt + 5; | ||
722 | #endif | ||
723 | #ifdef CONFIG_ACPI_NUMA | ||
724 | if (!strncmp(opt, "noacpi", 6)) | ||
725 | acpi_numa = -1; | ||
726 | #endif | ||
727 | return 0; | ||
728 | } | ||
729 | early_param("numa", numa_setup); | ||
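
For reference, the options dispatched by numa_setup() are given on the kernel command line, for example:

    numa=off          disable NUMA handling entirely
    numa=fake=4       emulate four nodes (requires CONFIG_NUMA_EMU)
    numa=fake=128M    emulate 128 MB nodes (requires CONFIG_NUMA_EMU)
    numa=noacpi       ignore the ACPI SRAT table (with CONFIG_ACPI_NUMA)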
730 | |||
731 | #ifdef CONFIG_NUMA | ||
732 | |||
733 | static __init int find_near_online_node(int node) | ||
734 | { | ||
735 | int n, val; | ||
736 | int min_val = INT_MAX; | ||
737 | int best_node = -1; | ||
738 | |||
739 | for_each_online_node(n) { | ||
740 | val = node_distance(node, n); | ||
741 | |||
742 | if (val < min_val) { | ||
743 | min_val = val; | ||
744 | best_node = n; | ||
745 | } | ||
746 | } | ||
747 | |||
748 | return best_node; | ||
749 | } | ||
750 | |||
751 | /* | ||
752 | * Setup early cpu_to_node. | ||
753 | * | ||
754 | * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] | ||
755 | * and apicid_to_node[] tables have valid entries for a CPU. | ||
756 | * This means we skip cpu_to_node[] initialisation for NUMA | ||
757 | * emulation and the fake-node case (when running a kernel compiled | ||
758 | * for NUMA on a non-NUMA box), which is OK because cpu_to_node[] | ||
759 | * has already been initialized in a round-robin manner by | ||
760 | * numa_init_array() before this call, and that initialization is | ||
761 | * good enough for the fake NUMA cases. | ||
762 | * | ||
763 | * Called before the per_cpu areas are setup. | ||
764 | */ | ||
765 | void __init init_cpu_to_node(void) | ||
766 | { | ||
767 | int cpu; | ||
768 | u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); | ||
769 | |||
770 | BUG_ON(cpu_to_apicid == NULL); | ||
771 | |||
772 | for_each_possible_cpu(cpu) { | ||
773 | int node; | ||
774 | u16 apicid = cpu_to_apicid[cpu]; | ||
775 | |||
776 | if (apicid == BAD_APICID) | ||
777 | continue; | ||
778 | node = apicid_to_node[apicid]; | ||
779 | if (node == NUMA_NO_NODE) | ||
780 | continue; | ||
781 | if (!node_online(node)) | ||
782 | node = find_near_online_node(node); | ||
783 | numa_set_node(cpu, node); | ||
784 | } | ||
785 | } | ||
786 | #endif | ||
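
find_near_online_node() and init_cpu_to_node() above implement a simple fallback: map each CPU through its APIC ID to a node, and if that node is not online, pick the online node with the smallest node_distance(). The userspace sketch below shows the selection on a hand-written 3x3 distance matrix; the numbers are invented, not taken from any real SLIT.

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NR_NODES 3

    /* illustrative distances: node 2 is the closest neighbour of node 1 */
    static const int node_distance[NR_NODES][NR_NODES] = {
        { 10, 20, 20 },
        { 20, 10, 12 },
        { 20, 12, 10 },
    };

    static const bool node_online[NR_NODES] = { true, false, true };

    /* mirrors find_near_online_node(): nearest online node by distance */
    static int find_near_online_node(int node)
    {
        int n, best = -1, min_val = INT_MAX;

        for (n = 0; n < NR_NODES; n++) {
            if (!node_online[n])
                continue;
            if (node_distance[node][n] < min_val) {
                min_val = node_distance[node][n];
                best = n;
            }
        }
        return best;
    }

    int main(void)
    {
        /* node 1 is offline, so CPUs mapped there land on node 2 instead */
        printf("node 1 falls back to node %d\n", find_near_online_node(1));
        return 0;
    }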
787 | |||
788 | |||
789 | void __cpuinit numa_set_node(int cpu, int node) | ||
790 | { | ||
791 | int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map); | ||
792 | |||
793 | /* early setting, no percpu area yet */ | ||
794 | if (cpu_to_node_map) { | ||
795 | cpu_to_node_map[cpu] = node; | ||
796 | return; | ||
797 | } | ||
798 | |||
799 | #ifdef CONFIG_DEBUG_PER_CPU_MAPS | ||
800 | if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) { | ||
801 | printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu); | ||
802 | dump_stack(); | ||
803 | return; | ||
804 | } | ||
805 | #endif | ||
806 | per_cpu(x86_cpu_to_node_map, cpu) = node; | ||
807 | |||
808 | if (node != NUMA_NO_NODE) | ||
809 | set_cpu_numa_node(cpu, node); | ||
810 | } | ||
811 | |||
812 | void __cpuinit numa_clear_node(int cpu) | ||
813 | { | ||
814 | numa_set_node(cpu, NUMA_NO_NODE); | ||
815 | } | ||
816 | |||
817 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
818 | |||
819 | void __cpuinit numa_add_cpu(int cpu) | ||
820 | { | ||
821 | cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
822 | } | ||
823 | |||
824 | void __cpuinit numa_remove_cpu(int cpu) | ||
825 | { | ||
826 | cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); | ||
827 | } | ||
828 | |||
829 | #else /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
830 | |||
831 | /* | ||
832 | * --------- debug versions of the numa functions --------- | ||
833 | */ | ||
834 | static void __cpuinit numa_set_cpumask(int cpu, int enable) | ||
835 | { | ||
836 | int node = early_cpu_to_node(cpu); | ||
837 | struct cpumask *mask; | ||
838 | char buf[64]; | ||
839 | |||
840 | mask = node_to_cpumask_map[node]; | ||
841 | if (mask == NULL) { | ||
842 | printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); | ||
843 | dump_stack(); | ||
844 | return; | ||
845 | } | ||
846 | |||
847 | if (enable) | ||
848 | cpumask_set_cpu(cpu, mask); | ||
849 | else | ||
850 | cpumask_clear_cpu(cpu, mask); | ||
851 | |||
852 | cpulist_scnprintf(buf, sizeof(buf), mask); | ||
853 | printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", | ||
854 | enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); | ||
855 | } | ||
856 | |||
857 | void __cpuinit numa_add_cpu(int cpu) | ||
858 | { | ||
859 | numa_set_cpumask(cpu, 1); | ||
860 | } | ||
861 | |||
862 | void __cpuinit numa_remove_cpu(int cpu) | ||
863 | { | ||
864 | numa_set_cpumask(cpu, 0); | ||
865 | } | ||
866 | |||
867 | int __cpu_to_node(int cpu) | ||
868 | { | ||
869 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) { | ||
870 | printk(KERN_WARNING | ||
871 | "cpu_to_node(%d): usage too early!\n", cpu); | ||
872 | dump_stack(); | ||
873 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
874 | } | ||
875 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
876 | } | ||
877 | EXPORT_SYMBOL(__cpu_to_node); | ||
878 | |||
879 | /* | ||
880 | * Same function as cpu_to_node() but used if called before the | ||
881 | * per_cpu areas are setup. | ||
882 | */ | ||
883 | int early_cpu_to_node(int cpu) | ||
884 | { | ||
885 | if (early_per_cpu_ptr(x86_cpu_to_node_map)) | ||
886 | return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu]; | ||
887 | |||
888 | if (!cpu_possible(cpu)) { | ||
889 | printk(KERN_WARNING | ||
890 | "early_cpu_to_node(%d): no per_cpu area!\n", cpu); | ||
891 | dump_stack(); | ||
892 | return NUMA_NO_NODE; | ||
893 | } | ||
894 | return per_cpu(x86_cpu_to_node_map, cpu); | ||
895 | } | ||
896 | |||
897 | /* | ||
898 | * --------- end of debug versions of the numa functions --------- | ||
899 | */ | ||
900 | |||
901 | #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ | ||
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c new file mode 100644 index 000000000000..d0ed086b6247 --- /dev/null +++ b/arch/x86/mm/numa_emulation.c | |||
@@ -0,0 +1,492 @@ | |||
1 | /* | ||
2 | * NUMA emulation | ||
3 | */ | ||
4 | #include <linux/kernel.h> | ||
5 | #include <linux/errno.h> | ||
6 | #include <linux/topology.h> | ||
7 | #include <linux/memblock.h> | ||
8 | #include <linux/bootmem.h> | ||
9 | #include <asm/dma.h> | ||
10 | |||
11 | #include "numa_internal.h" | ||
12 | |||
13 | static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata; | ||
14 | static char *emu_cmdline __initdata; | ||
15 | |||
16 | void __init numa_emu_cmdline(char *str) | ||
17 | { | ||
18 | emu_cmdline = str; | ||
19 | } | ||
20 | |||
21 | static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) | ||
22 | { | ||
23 | int i; | ||
24 | |||
25 | for (i = 0; i < mi->nr_blks; i++) | ||
26 | if (mi->blk[i].nid == nid) | ||
27 | return i; | ||
28 | return -ENOENT; | ||
29 | } | ||
30 | |||
31 | /* | ||
32 | * Sets up nid to range from @start to @end. The return value is -errno if | ||
33 | * something went wrong, 0 otherwise. | ||
34 | */ | ||
35 | static int __init emu_setup_memblk(struct numa_meminfo *ei, | ||
36 | struct numa_meminfo *pi, | ||
37 | int nid, int phys_blk, u64 size) | ||
38 | { | ||
39 | struct numa_memblk *eb = &ei->blk[ei->nr_blks]; | ||
40 | struct numa_memblk *pb = &pi->blk[phys_blk]; | ||
41 | |||
42 | if (ei->nr_blks >= NR_NODE_MEMBLKS) { | ||
43 | pr_err("NUMA: Too many emulated memblks, failing emulation\n"); | ||
44 | return -EINVAL; | ||
45 | } | ||
46 | |||
47 | ei->nr_blks++; | ||
48 | eb->start = pb->start; | ||
49 | eb->end = pb->start + size; | ||
50 | eb->nid = nid; | ||
51 | |||
52 | if (emu_nid_to_phys[nid] == NUMA_NO_NODE) | ||
53 | emu_nid_to_phys[nid] = pb->nid; | ||
54 | |||
55 | pb->start += size; | ||
56 | if (pb->start >= pb->end) { | ||
57 | WARN_ON_ONCE(pb->start > pb->end); | ||
58 | numa_remove_memblk_from(phys_blk, pi); | ||
59 | } | ||
60 | |||
61 | printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid, | ||
62 | eb->start, eb->end, (eb->end - eb->start) >> 20); | ||
63 | return 0; | ||
64 | } | ||
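
emu_setup_memblk() above carves `size' bytes off the front of one physical memblock, records the piece as an emulated block, shrinks the physical block in place and drops it once it is empty. The sketch below reproduces that bookkeeping on plain structs; the 1 GB physical block and 256 MB fake-node size are invented for illustration.

    #include <stdint.h>
    #include <stdio.h>

    struct memblk { uint64_t start, end; int nid; };

    /* carve `size' bytes from the front of *phys into *emu (emulated id nid);
     * returns the bytes left in the physical block, 0 when it is used up */
    static uint64_t carve(struct memblk *phys, struct memblk *emu, int nid,
                          uint64_t size)
    {
        emu->start = phys->start;
        emu->end = phys->start + size;
        emu->nid = nid;
        phys->start += size;                /* shrink the physical block */
        return phys->end - phys->start;
    }

    int main(void)
    {
        struct memblk phys = { 0, 1ULL << 30, 0 };  /* one 1 GB physical block */
        struct memblk emu[4];
        uint64_t left = phys.end - phys.start;
        int nid = 0;

        while (left >= 256ULL << 20) {              /* 256 MB fake nodes */
            left = carve(&phys, &emu[nid], nid, 256ULL << 20);
            printf("fake node %d: %lluMB-%lluMB\n", nid,
                   (unsigned long long)(emu[nid].start >> 20),
                   (unsigned long long)(emu[nid].end >> 20));
            nid++;
        }
        return 0;
    }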
65 | |||
66 | /* | ||
67 | * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr | ||
68 | * to max_addr. The return value is the number of nodes allocated. | ||
69 | */ | ||
70 | static int __init split_nodes_interleave(struct numa_meminfo *ei, | ||
71 | struct numa_meminfo *pi, | ||
72 | u64 addr, u64 max_addr, int nr_nodes) | ||
73 | { | ||
74 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
75 | u64 size; | ||
76 | int big; | ||
77 | int nid = 0; | ||
78 | int i, ret; | ||
79 | |||
80 | if (nr_nodes <= 0) | ||
81 | return -1; | ||
82 | if (nr_nodes > MAX_NUMNODES) { | ||
83 | pr_info("numa=fake=%d too large, reducing to %d\n", | ||
84 | nr_nodes, MAX_NUMNODES); | ||
85 | nr_nodes = MAX_NUMNODES; | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Calculate target node size. x86_32 freaks on __udivdi3() so do | ||
90 | * the division in ulong number of pages and convert back. | ||
91 | */ | ||
92 | size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); | ||
93 | size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); | ||
94 | |||
95 | /* | ||
96 | * Calculate the number of big nodes that can be allocated as a result | ||
97 | * of consolidating the remainder. | ||
98 | */ | ||
99 | big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / | ||
100 | FAKE_NODE_MIN_SIZE; | ||
101 | |||
102 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
103 | if (!size) { | ||
104 | pr_err("Not enough memory for each node. " | ||
105 | "NUMA emulation disabled.\n"); | ||
106 | return -1; | ||
107 | } | ||
108 | |||
109 | for (i = 0; i < pi->nr_blks; i++) | ||
110 | node_set(pi->blk[i].nid, physnode_mask); | ||
111 | |||
112 | /* | ||
113 | * Continue to fill physical nodes with fake nodes until there is no | ||
114 | * memory left on any of them. | ||
115 | */ | ||
116 | while (nodes_weight(physnode_mask)) { | ||
117 | for_each_node_mask(i, physnode_mask) { | ||
118 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
119 | u64 start, limit, end; | ||
120 | int phys_blk; | ||
121 | |||
122 | phys_blk = emu_find_memblk_by_nid(i, pi); | ||
123 | if (phys_blk < 0) { | ||
124 | node_clear(i, physnode_mask); | ||
125 | continue; | ||
126 | } | ||
127 | start = pi->blk[phys_blk].start; | ||
128 | limit = pi->blk[phys_blk].end; | ||
129 | end = start + size; | ||
130 | |||
131 | if (nid < big) | ||
132 | end += FAKE_NODE_MIN_SIZE; | ||
133 | |||
134 | /* | ||
135 | * Continue to add memory to this fake node if its | ||
136 | * non-reserved memory is less than the per-node size. | ||
137 | */ | ||
138 | while (end - start - | ||
139 | memblock_x86_hole_size(start, end) < size) { | ||
140 | end += FAKE_NODE_MIN_SIZE; | ||
141 | if (end > limit) { | ||
142 | end = limit; | ||
143 | break; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
149 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
150 | * this one must extend to the boundary. | ||
151 | */ | ||
152 | if (end < dma32_end && dma32_end - end - | ||
153 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
154 | end = dma32_end; | ||
155 | |||
156 | /* | ||
157 | * If there won't be enough non-reserved memory for the | ||
158 | * next node, this one must extend to the end of the | ||
159 | * physical node. | ||
160 | */ | ||
161 | if (limit - end - | ||
162 | memblock_x86_hole_size(end, limit) < size) | ||
163 | end = limit; | ||
164 | |||
165 | ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, | ||
166 | phys_blk, | ||
167 | min(end, limit) - start); | ||
168 | if (ret < 0) | ||
169 | return ret; | ||
170 | } | ||
171 | } | ||
172 | return 0; | ||
173 | } | ||
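
The size/`big' computation at the top of split_nodes_interleave() is easier to see with concrete numbers. Assuming 4 GB of non-hole memory, numa=fake=3 and (for illustration only) a 32 MB FAKE_NODE_MIN_SIZE, the page-based division gives roughly 1365 MB per node, which rounds down to 1344 MB; the remainder is recycled by letting `big' nodes grow one extra 32 MB chunk, and whatever is still left over is absorbed when the last node on a physical node is extended to its end. The sketch below redoes exactly that arithmetic:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT              12
    #define FAKE_NODE_MIN_SIZE      (32ULL << 20)   /* illustrative granularity */
    #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1))

    int main(void)
    {
        uint64_t usable = 4ULL << 30;   /* pretend 4 GB of non-hole memory */
        int nr_nodes = 3;               /* numa=fake=3 */
        uint64_t size, big;

        /* same steps as split_nodes_interleave(): divide in pages, then
         * round every node down to FAKE_NODE_MIN_SIZE granularity */
        size = ((usable >> PAGE_SHIFT) / nr_nodes) << PAGE_SHIFT;
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / FAKE_NODE_MIN_SIZE;
        size &= FAKE_NODE_MIN_HASH_MASK;

        printf("base node size %lluMB, %llu node(s) get an extra %lluMB\n",
               (unsigned long long)(size >> 20), (unsigned long long)big,
               (unsigned long long)(FAKE_NODE_MIN_SIZE >> 20));
        return 0;
    }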
174 | |||
175 | /* | ||
176 | * Returns the end address of a node so that there is at least `size' amount of | ||
177 | * non-reserved memory or `max_addr' is reached. | ||
178 | */ | ||
179 | static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) | ||
180 | { | ||
181 | u64 end = start + size; | ||
182 | |||
183 | while (end - start - memblock_x86_hole_size(start, end) < size) { | ||
184 | end += FAKE_NODE_MIN_SIZE; | ||
185 | if (end > max_addr) { | ||
186 | end = max_addr; | ||
187 | break; | ||
188 | } | ||
189 | } | ||
190 | return end; | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * Sets up fake nodes of `size' interleaved over physical nodes ranging from | ||
195 | * `addr' to `max_addr'. The return value is the number of nodes allocated. | ||
196 | */ | ||
197 | static int __init split_nodes_size_interleave(struct numa_meminfo *ei, | ||
198 | struct numa_meminfo *pi, | ||
199 | u64 addr, u64 max_addr, u64 size) | ||
200 | { | ||
201 | nodemask_t physnode_mask = NODE_MASK_NONE; | ||
202 | u64 min_size; | ||
203 | int nid = 0; | ||
204 | int i, ret; | ||
205 | |||
206 | if (!size) | ||
207 | return -1; | ||
208 | /* | ||
209 | * The limit on emulated nodes is MAX_NUMNODES, so the size per node is | ||
210 | * increased accordingly if the requested size is too small. This | ||
211 | * creates a uniform distribution of node sizes across the entire | ||
212 | * machine (but not necessarily over physical nodes). | ||
213 | */ | ||
214 | min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / | ||
215 | MAX_NUMNODES; | ||
216 | min_size = max(min_size, FAKE_NODE_MIN_SIZE); | ||
217 | if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) | ||
218 | min_size = (min_size + FAKE_NODE_MIN_SIZE) & | ||
219 | FAKE_NODE_MIN_HASH_MASK; | ||
220 | if (size < min_size) { | ||
221 | pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", | ||
222 | size >> 20, min_size >> 20); | ||
223 | size = min_size; | ||
224 | } | ||
225 | size &= FAKE_NODE_MIN_HASH_MASK; | ||
226 | |||
227 | for (i = 0; i < pi->nr_blks; i++) | ||
228 | node_set(pi->blk[i].nid, physnode_mask); | ||
229 | |||
230 | /* | ||
231 | * Fill physical nodes with fake nodes of size until there is no memory | ||
232 | * left on any of them. | ||
233 | */ | ||
234 | while (nodes_weight(physnode_mask)) { | ||
235 | for_each_node_mask(i, physnode_mask) { | ||
236 | u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); | ||
237 | u64 start, limit, end; | ||
238 | int phys_blk; | ||
239 | |||
240 | phys_blk = emu_find_memblk_by_nid(i, pi); | ||
241 | if (phys_blk < 0) { | ||
242 | node_clear(i, physnode_mask); | ||
243 | continue; | ||
244 | } | ||
245 | start = pi->blk[phys_blk].start; | ||
246 | limit = pi->blk[phys_blk].end; | ||
247 | |||
248 | end = find_end_of_node(start, limit, size); | ||
249 | /* | ||
250 | * If there won't be at least FAKE_NODE_MIN_SIZE of | ||
251 | * non-reserved memory in ZONE_DMA32 for the next node, | ||
252 | * this one must extend to the boundary. | ||
253 | */ | ||
254 | if (end < dma32_end && dma32_end - end - | ||
255 | memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) | ||
256 | end = dma32_end; | ||
257 | |||
258 | /* | ||
259 | * If there won't be enough non-reserved memory for the | ||
260 | * next node, this one must extend to the end of the | ||
261 | * physical node. | ||
262 | */ | ||
263 | if (limit - end - | ||
264 | memblock_x86_hole_size(end, limit) < size) | ||
265 | end = limit; | ||
266 | |||
267 | ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, | ||
268 | phys_blk, | ||
269 | min(end, limit) - start); | ||
270 | if (ret < 0) | ||
271 | return ret; | ||
272 | } | ||
273 | } | ||
274 | return 0; | ||
275 | } | ||
276 | |||
277 | /** | ||
278 | * numa_emulation - Emulate NUMA nodes | ||
279 | * @numa_meminfo: NUMA configuration to massage | ||
280 | * @numa_dist_cnt: The size of the physical NUMA distance table | ||
281 | * | ||
282 | * Emulate NUMA nodes according to the numa=fake kernel parameter. | ||
283 | * @numa_meminfo contains the physical memory configuration and is modified | ||
284 | * to reflect the emulated configuration on success. @numa_dist_cnt is | ||
285 | * used to determine the size of the physical distance table. | ||
286 | * | ||
287 | * On success, the following modifications are made. | ||
288 | * | ||
289 | * - @numa_meminfo is updated to reflect the emulated nodes. | ||
290 | * | ||
291 | * - __apicid_to_node[] is updated such that APIC IDs are mapped to the | ||
292 | * emulated nodes. | ||
293 | * | ||
294 | * - NUMA distance table is rebuilt to represent distances between emulated | ||
295 | * nodes. The distances are determined considering how emulated nodes | ||
296 | * are mapped to physical nodes and match the actual distances. | ||
297 | * | ||
298 | * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical | ||
299 | * nodes. This is used by numa_add_cpu() and numa_remove_cpu(). | ||
300 | * | ||
301 | * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with | ||
302 | * identity mapping and no other modification is made. | ||
303 | */ | ||
304 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) | ||
305 | { | ||
306 | static struct numa_meminfo ei __initdata; | ||
307 | static struct numa_meminfo pi __initdata; | ||
308 | const u64 max_addr = PFN_PHYS(max_pfn); | ||
309 | u8 *phys_dist = NULL; | ||
310 | size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); | ||
311 | int max_emu_nid, dfl_phys_nid; | ||
312 | int i, j, ret; | ||
313 | |||
314 | if (!emu_cmdline) | ||
315 | goto no_emu; | ||
316 | |||
317 | memset(&ei, 0, sizeof(ei)); | ||
318 | pi = *numa_meminfo; | ||
319 | |||
320 | for (i = 0; i < MAX_NUMNODES; i++) | ||
321 | emu_nid_to_phys[i] = NUMA_NO_NODE; | ||
322 | |||
323 | /* | ||
324 | * If the numa=fake command-line contains a 'M' or 'G', it represents | ||
325 | * the fixed node size. Otherwise, if it is just a single number N, | ||
326 | * split the system RAM into N fake nodes. | ||
327 | */ | ||
328 | if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) { | ||
329 | u64 size; | ||
330 | |||
331 | size = memparse(emu_cmdline, &emu_cmdline); | ||
332 | ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size); | ||
333 | } else { | ||
334 | unsigned long n; | ||
335 | |||
336 | n = simple_strtoul(emu_cmdline, NULL, 0); | ||
337 | ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n); | ||
338 | } | ||
339 | |||
340 | if (ret < 0) | ||
341 | goto no_emu; | ||
342 | |||
343 | if (numa_cleanup_meminfo(&ei) < 0) { | ||
344 | pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n"); | ||
345 | goto no_emu; | ||
346 | } | ||
347 | |||
348 | /* copy the physical distance table */ | ||
349 | if (numa_dist_cnt) { | ||
350 | u64 phys; | ||
351 | |||
352 | phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), | ||
353 | phys_size, PAGE_SIZE); | ||
354 | if (phys == MEMBLOCK_ERROR) { | ||
355 | pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); | ||
356 | goto no_emu; | ||
357 | } | ||
358 | memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); | ||
359 | phys_dist = __va(phys); | ||
360 | |||
361 | for (i = 0; i < numa_dist_cnt; i++) | ||
362 | for (j = 0; j < numa_dist_cnt; j++) | ||
363 | phys_dist[i * numa_dist_cnt + j] = | ||
364 | node_distance(i, j); | ||
365 | } | ||
366 | |||
367 | /* | ||
368 | * Determine the max emulated nid and the default phys nid to use | ||
369 | * for unmapped nodes. | ||
370 | */ | ||
371 | max_emu_nid = 0; | ||
372 | dfl_phys_nid = NUMA_NO_NODE; | ||
373 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) { | ||
374 | if (emu_nid_to_phys[i] != NUMA_NO_NODE) { | ||
375 | max_emu_nid = i; | ||
376 | if (dfl_phys_nid == NUMA_NO_NODE) | ||
377 | dfl_phys_nid = emu_nid_to_phys[i]; | ||
378 | } | ||
379 | } | ||
380 | if (dfl_phys_nid == NUMA_NO_NODE) { | ||
381 | pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n"); | ||
382 | goto no_emu; | ||
383 | } | ||
384 | |||
385 | /* commit */ | ||
386 | *numa_meminfo = ei; | ||
387 | |||
388 | /* | ||
389 | * Transform __apicid_to_node table to use emulated nids by | ||
390 | * reverse-mapping phys_nid. The maps should always exist but fall | ||
391 | * back to zero just in case. | ||
392 | */ | ||
393 | for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) { | ||
394 | if (__apicid_to_node[i] == NUMA_NO_NODE) | ||
395 | continue; | ||
396 | for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++) | ||
397 | if (__apicid_to_node[i] == emu_nid_to_phys[j]) | ||
398 | break; | ||
399 | __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0; | ||
400 | } | ||
401 | |||
402 | /* make sure all emulated nodes are mapped to a physical node */ | ||
403 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | ||
404 | if (emu_nid_to_phys[i] == NUMA_NO_NODE) | ||
405 | emu_nid_to_phys[i] = dfl_phys_nid; | ||
406 | |||
407 | /* transform distance table */ | ||
408 | numa_reset_distance(); | ||
409 | for (i = 0; i < max_emu_nid + 1; i++) { | ||
410 | for (j = 0; j < max_emu_nid + 1; j++) { | ||
411 | int physi = emu_nid_to_phys[i]; | ||
412 | int physj = emu_nid_to_phys[j]; | ||
413 | int dist; | ||
414 | |||
415 | if (physi >= numa_dist_cnt || physj >= numa_dist_cnt) | ||
416 | dist = physi == physj ? | ||
417 | LOCAL_DISTANCE : REMOTE_DISTANCE; | ||
418 | else | ||
419 | dist = phys_dist[physi * numa_dist_cnt + physj]; | ||
420 | |||
421 | numa_set_distance(i, j, dist); | ||
422 | } | ||
423 | } | ||
424 | |||
425 | /* free the copied physical distance table */ | ||
426 | if (phys_dist) | ||
427 | memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); | ||
428 | return; | ||
429 | |||
430 | no_emu: | ||
431 | /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */ | ||
432 | for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) | ||
433 | emu_nid_to_phys[i] = i; | ||
434 | } | ||
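
Much of numa_emulation() above is bookkeeping: every emulated node remembers its physical node in emu_nid_to_phys[], and the distance table is rebuilt so that two emulated nodes inherit the distance between the physical nodes they live on, falling back to LOCAL_DISTANCE/REMOTE_DISTANCE when an index is out of range. The sketch below redoes that transform for two physical nodes split into four emulated ones; the distance values and the mapping are invented for illustration.

    #include <stdio.h>

    #define LOCAL_DISTANCE  10
    #define REMOTE_DISTANCE 20
    #define NR_PHYS 2
    #define NR_EMU  4

    int main(void)
    {
        /* physical distance table: two sockets, remote distance 21 */
        const int phys_dist[NR_PHYS][NR_PHYS] = { { 10, 21 }, { 21, 10 } };
        /* emulated nodes 0,1 live on physical node 0; nodes 2,3 on node 1 */
        const int emu_nid_to_phys[NR_EMU] = { 0, 0, 1, 1 };
        int i, j;

        for (i = 0; i < NR_EMU; i++) {
            for (j = 0; j < NR_EMU; j++) {
                int pi = emu_nid_to_phys[i], pj = emu_nid_to_phys[j];
                int dist;

                if (pi >= NR_PHYS || pj >= NR_PHYS)
                    dist = pi == pj ? LOCAL_DISTANCE : REMOTE_DISTANCE;
                else
                    dist = phys_dist[pi][pj];
                printf("%3d", dist);    /* emulated distance (i, j) */
            }
            printf("\n");
        }
        return 0;
    }

Emulated nodes that share a physical node therefore report the local distance to each other, which is what the loop over max_emu_nid above produces.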
435 | |||
436 | #ifndef CONFIG_DEBUG_PER_CPU_MAPS | ||
437 | void __cpuinit numa_add_cpu(int cpu) | ||
438 | { | ||
439 | int physnid, nid; | ||
440 | |||
441 | nid = early_cpu_to_node(cpu); | ||
442 | BUG_ON(nid == NUMA_NO_NODE || !node_online(nid)); | ||
443 | |||
444 | physnid = emu_nid_to_phys[nid]; | ||
445 | |||
446 | /* | ||
447 | * Map the cpu to each emulated node that is allocated on the physical | ||
448 | * node of the cpu's apic id. | ||
449 | */ | ||
450 | for_each_online_node(nid) | ||
451 | if (emu_nid_to_phys[nid] == physnid) | ||
452 | cpumask_set_cpu(cpu, node_to_cpumask_map[nid]); | ||
453 | } | ||
454 | |||
455 | void __cpuinit numa_remove_cpu(int cpu) | ||
456 | { | ||
457 | int i; | ||
458 | |||
459 | for_each_online_node(i) | ||
460 | cpumask_clear_cpu(cpu, node_to_cpumask_map[i]); | ||
461 | } | ||
462 | #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
463 | static void __cpuinit numa_set_cpumask(int cpu, bool enable) | ||
464 | { | ||
465 | int nid, physnid; | ||
466 | |||
467 | nid = early_cpu_to_node(cpu); | ||
468 | if (nid == NUMA_NO_NODE) { | ||
469 | /* early_cpu_to_node() already emits a warning and trace */ | ||
470 | return; | ||
471 | } | ||
472 | |||
473 | physnid = emu_nid_to_phys[nid]; | ||
474 | |||
475 | for_each_online_node(nid) { | ||
476 | if (emu_nid_to_phys[nid] != physnid) | ||
477 | continue; | ||
478 | |||
479 | debug_cpumask_set_cpu(cpu, nid, enable); | ||
480 | } | ||
481 | } | ||
482 | |||
483 | void __cpuinit numa_add_cpu(int cpu) | ||
484 | { | ||
485 | numa_set_cpumask(cpu, true); | ||
486 | } | ||
487 | |||
488 | void __cpuinit numa_remove_cpu(int cpu) | ||
489 | { | ||
490 | numa_set_cpumask(cpu, false); | ||
491 | } | ||
492 | #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ | ||
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h new file mode 100644 index 000000000000..7178c3afe05e --- /dev/null +++ b/arch/x86/mm/numa_internal.h | |||
@@ -0,0 +1,39 @@ | |||
1 | #ifndef __X86_MM_NUMA_INTERNAL_H | ||
2 | #define __X86_MM_NUMA_INTERNAL_H | ||
3 | |||
4 | #include <linux/types.h> | ||
5 | #include <asm/numa.h> | ||
6 | |||
7 | struct numa_memblk { | ||
8 | u64 start; | ||
9 | u64 end; | ||
10 | int nid; | ||
11 | }; | ||
12 | |||
13 | struct numa_meminfo { | ||
14 | int nr_blks; | ||
15 | struct numa_memblk blk[NR_NODE_MEMBLKS]; | ||
16 | }; | ||
17 | |||
18 | void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); | ||
19 | int __init numa_cleanup_meminfo(struct numa_meminfo *mi); | ||
20 | void __init numa_reset_distance(void); | ||
21 | |||
22 | void __init x86_numa_init(void); | ||
23 | |||
24 | #ifdef CONFIG_X86_64 | ||
25 | static inline void init_alloc_remap(int nid, u64 start, u64 end) { } | ||
26 | #else | ||
27 | void __init init_alloc_remap(int nid, u64 start, u64 end); | ||
28 | #endif | ||
29 | |||
30 | #ifdef CONFIG_NUMA_EMU | ||
31 | void __init numa_emulation(struct numa_meminfo *numa_meminfo, | ||
32 | int numa_dist_cnt); | ||
33 | #else | ||
34 | static inline void numa_emulation(struct numa_meminfo *numa_meminfo, | ||
35 | int numa_dist_cnt) | ||
36 | { } | ||
37 | #endif | ||
38 | |||
39 | #endif /* __X86_MM_NUMA_INTERNAL_H */ | ||
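
struct numa_memblk and struct numa_meminfo introduced here are the intermediate representation that both the real detection paths and the emulation code manipulate. A trivial, self-contained illustration of filling one is shown below; the field layout follows the header above, while NR_NODE_MEMBLKS and the block sizes are made up.

    #include <stdint.h>
    #include <stdio.h>

    #define NR_NODE_MEMBLKS 16  /* illustrative; the kernel derives its own value */

    struct numa_memblk { uint64_t start, end; int nid; };
    struct numa_meminfo { int nr_blks; struct numa_memblk blk[NR_NODE_MEMBLKS]; };

    static void add_memblk(struct numa_meminfo *mi, int nid,
                           uint64_t start, uint64_t end)
    {
        if (mi->nr_blks >= NR_NODE_MEMBLKS)
            return;                     /* table full, silently drop */
        mi->blk[mi->nr_blks++] = (struct numa_memblk){ start, end, nid };
    }

    int main(void)
    {
        struct numa_meminfo mi = { 0 };
        int i;

        /* two nodes of 2 GB each, as an SRAT scan might report */
        add_memblk(&mi, 0, 0,          2ULL << 30);
        add_memblk(&mi, 1, 2ULL << 30, 4ULL << 30);

        for (i = 0; i < mi.nr_blks; i++)
            printf("node %d: %lluMB-%lluMB\n", mi.blk[i].nid,
                   (unsigned long long)(mi.blk[i].start >> 20),
                   (unsigned long long)(mi.blk[i].end >> 20));
        return 0;
    }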
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 532e7933d606..f9e526742fa1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/pfn.h> | 13 | #include <linux/pfn.h> |
14 | #include <linux/percpu.h> | 14 | #include <linux/percpu.h> |
15 | #include <linux/gfp.h> | 15 | #include <linux/gfp.h> |
16 | #include <linux/pci.h> | ||
16 | 17 | ||
17 | #include <asm/e820.h> | 18 | #include <asm/e820.h> |
18 | #include <asm/processor.h> | 19 | #include <asm/processor.h> |
@@ -56,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM]; | |||
56 | 57 | ||
57 | void update_page_count(int level, unsigned long pages) | 58 | void update_page_count(int level, unsigned long pages) |
58 | { | 59 | { |
59 | unsigned long flags; | ||
60 | |||
61 | /* Protect against CPA */ | 60 | /* Protect against CPA */ |
62 | spin_lock_irqsave(&pgd_lock, flags); | 61 | spin_lock(&pgd_lock); |
63 | direct_pages_count[level] += pages; | 62 | direct_pages_count[level] += pages; |
64 | spin_unlock_irqrestore(&pgd_lock, flags); | 63 | spin_unlock(&pgd_lock); |
65 | } | 64 | } |
66 | 65 | ||
67 | static void split_page_count(int level) | 66 | static void split_page_count(int level) |
@@ -260,8 +259,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | |||
260 | * The BIOS area between 640k and 1Mb needs to be executable for | 259 | * The BIOS area between 640k and 1Mb needs to be executable for |
261 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. | 260 | * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. |
262 | */ | 261 | */ |
263 | if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) | 262 | #ifdef CONFIG_PCI_BIOS |
263 | if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) | ||
264 | pgprot_val(forbidden) |= _PAGE_NX; | 264 | pgprot_val(forbidden) |= _PAGE_NX; |
265 | #endif | ||
265 | 266 | ||
266 | /* | 267 | /* |
267 | * The kernel text needs to be executable for obvious reasons | 268 | * The kernel text needs to be executable for obvious reasons |
@@ -309,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, | |||
309 | * these shared mappings are made of small page mappings. | 310 | * these shared mappings are made of small page mappings. |
310 | * Thus this doesn't enforce the !RW mapping for small page kernel | 311 |
311 | * text; the mapping logic will help Linux Xen paravirt guests boot | 312 |
312 | * aswell. | 313 | * as well. |
313 | */ | 314 | */ |
314 | if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) | 315 | if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) |
315 | pgprot_val(forbidden) |= _PAGE_RW; | 316 | pgprot_val(forbidden) |= _PAGE_RW; |
@@ -391,16 +392,16 @@ static int | |||
391 | try_preserve_large_page(pte_t *kpte, unsigned long address, | 392 | try_preserve_large_page(pte_t *kpte, unsigned long address, |
392 | struct cpa_data *cpa) | 393 | struct cpa_data *cpa) |
393 | { | 394 | { |
394 | unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; | 395 | unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; |
395 | pte_t new_pte, old_pte, *tmp; | 396 | pte_t new_pte, old_pte, *tmp; |
396 | pgprot_t old_prot, new_prot; | 397 | pgprot_t old_prot, new_prot, req_prot; |
397 | int i, do_split = 1; | 398 | int i, do_split = 1; |
398 | unsigned int level; | 399 | unsigned int level; |
399 | 400 | ||
400 | if (cpa->force_split) | 401 | if (cpa->force_split) |
401 | return 1; | 402 | return 1; |
402 | 403 | ||
403 | spin_lock_irqsave(&pgd_lock, flags); | 404 | spin_lock(&pgd_lock); |
404 | /* | 405 | /* |
405 | * Check for races, another CPU might have split this page | 406 | * Check for races, another CPU might have split this page |
406 | * up already: | 407 | * up already: |
@@ -438,10 +439,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
438 | * We are safe now. Check whether the new pgprot is the same: | 439 | * We are safe now. Check whether the new pgprot is the same: |
439 | */ | 440 | */ |
440 | old_pte = *kpte; | 441 | old_pte = *kpte; |
441 | old_prot = new_prot = pte_pgprot(old_pte); | 442 | old_prot = new_prot = req_prot = pte_pgprot(old_pte); |
442 | 443 | ||
443 | pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); | 444 | pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr); |
444 | pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); | 445 | pgprot_val(req_prot) |= pgprot_val(cpa->mask_set); |
445 | 446 | ||
446 | /* | 447 | /* |
447 | * old_pte points to the large page base address. So we need | 448 | * old_pte points to the large page base address. So we need |
@@ -450,17 +451,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
450 | pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); | 451 | pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); |
451 | cpa->pfn = pfn; | 452 | cpa->pfn = pfn; |
452 | 453 | ||
453 | new_prot = static_protections(new_prot, address, pfn); | 454 | new_prot = static_protections(req_prot, address, pfn); |
454 | 455 | ||
455 | /* | 456 | /* |
456 | * We need to check the full range, whether | 457 | * We need to check the full range, whether |
457 | * static_protection() requires a different pgprot for one of | 458 | * static_protection() requires a different pgprot for one of |
458 | * the pages in the range we try to preserve: | 459 | * the pages in the range we try to preserve: |
459 | */ | 460 | */ |
460 | addr = address + PAGE_SIZE; | 461 | addr = address & pmask; |
461 | pfn++; | 462 | pfn = pte_pfn(old_pte); |
462 | for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { | 463 | for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) { |
463 | pgprot_t chk_prot = static_protections(new_prot, addr, pfn); | 464 | pgprot_t chk_prot = static_protections(req_prot, addr, pfn); |
464 | 465 | ||
465 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) | 466 | if (pgprot_val(chk_prot) != pgprot_val(new_prot)) |
466 | goto out_unlock; | 467 | goto out_unlock; |
@@ -483,7 +484,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
483 | * that we limited the number of possible pages already to | 484 | * that we limited the number of possible pages already to |
484 | * the number of pages in the large page. | 485 | * the number of pages in the large page. |
485 | */ | 486 | */ |
486 | if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { | 487 | if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) { |
487 | /* | 488 | /* |
488 | * The address is aligned and the number of pages | 489 | * The address is aligned and the number of pages |
489 | * covers the full page. | 490 | * covers the full page. |
@@ -495,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, | |||
495 | } | 496 | } |
496 | 497 | ||
497 | out_unlock: | 498 | out_unlock: |
498 | spin_unlock_irqrestore(&pgd_lock, flags); | 499 | spin_unlock(&pgd_lock); |
499 | 500 | ||
500 | return do_split; | 501 | return do_split; |
501 | } | 502 | } |
502 | 503 | ||
503 | static int split_large_page(pte_t *kpte, unsigned long address) | 504 | static int split_large_page(pte_t *kpte, unsigned long address) |
504 | { | 505 | { |
505 | unsigned long flags, pfn, pfninc = 1; | 506 | unsigned long pfn, pfninc = 1; |
506 | unsigned int i, level; | 507 | unsigned int i, level; |
507 | pte_t *pbase, *tmp; | 508 | pte_t *pbase, *tmp; |
508 | pgprot_t ref_prot; | 509 | pgprot_t ref_prot; |
@@ -516,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) | |||
516 | if (!base) | 517 | if (!base) |
517 | return -ENOMEM; | 518 | return -ENOMEM; |
518 | 519 | ||
519 | spin_lock_irqsave(&pgd_lock, flags); | 520 | spin_lock(&pgd_lock); |
520 | /* | 521 | /* |
521 | * Check for races, another CPU might have split this page | 522 | * Check for races, another CPU might have split this page |
522 | * up for us already: | 523 | * up for us already: |
@@ -588,7 +589,7 @@ out_unlock: | |||
588 | */ | 589 | */ |
589 | if (base) | 590 | if (base) |
590 | __free_page(base); | 591 | __free_page(base); |
591 | spin_unlock_irqrestore(&pgd_lock, flags); | 592 | spin_unlock(&pgd_lock); |
592 | 593 | ||
593 | return 0; | 594 | return 0; |
594 | } | 595 | } |
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 38e6d174c497..9f0614daea85 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c | |||
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) | |||
414 | unsigned char *p; | 414 | unsigned char *p; |
415 | struct prefix_bits prf; | 415 | struct prefix_bits prf; |
416 | int i; | 416 | int i; |
417 | unsigned long rv; | ||
418 | 417 | ||
419 | p = (unsigned char *)ins_addr; | 418 | p = (unsigned char *)ins_addr; |
420 | p += skip_prefix(p, &prf); | 419 | p += skip_prefix(p, &prf); |
421 | p += get_opcode(p, &opcode); | 420 | p += get_opcode(p, &opcode); |
422 | for (i = 0; i < ARRAY_SIZE(reg_rop); i++) | 421 | for (i = 0; i < ARRAY_SIZE(reg_rop); i++) |
423 | if (reg_rop[i] == opcode) { | 422 | if (reg_rop[i] == opcode) |
424 | rv = REG_READ; | ||
425 | goto do_work; | 423 | goto do_work; |
426 | } | ||
427 | 424 | ||
428 | for (i = 0; i < ARRAY_SIZE(reg_wop); i++) | 425 | for (i = 0; i < ARRAY_SIZE(reg_wop); i++) |
429 | if (reg_wop[i] == opcode) { | 426 | if (reg_wop[i] == opcode) |
430 | rv = REG_WRITE; | ||
431 | goto do_work; | 427 | goto do_work; |
432 | } | ||
433 | 428 | ||
434 | printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " | 429 | printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " |
435 | "0x%02x\n", opcode); | 430 | "0x%02x\n", opcode); |
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) | |||
474 | unsigned char *p; | 469 | unsigned char *p; |
475 | struct prefix_bits prf; | 470 | struct prefix_bits prf; |
476 | int i; | 471 | int i; |
477 | unsigned long rv; | ||
478 | 472 | ||
479 | p = (unsigned char *)ins_addr; | 473 | p = (unsigned char *)ins_addr; |
480 | p += skip_prefix(p, &prf); | 474 | p += skip_prefix(p, &prf); |
481 | p += get_opcode(p, &opcode); | 475 | p += get_opcode(p, &opcode); |
482 | for (i = 0; i < ARRAY_SIZE(imm_wop); i++) | 476 | for (i = 0; i < ARRAY_SIZE(imm_wop); i++) |
483 | if (imm_wop[i] == opcode) { | 477 | if (imm_wop[i] == opcode) |
484 | rv = IMM_WRITE; | ||
485 | goto do_work; | 478 | goto do_work; |
486 | } | ||
487 | 479 | ||
488 | printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " | 480 | printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " |
489 | "0x%02x\n", opcode); | 481 | "0x%02x\n", opcode); |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5c4ee422590e..8573b83a63d0 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd) | |||
87 | #define UNSHARED_PTRS_PER_PGD \ | 87 | #define UNSHARED_PTRS_PER_PGD \ |
88 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) | 88 | (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) |
89 | 89 | ||
90 | static void pgd_ctor(pgd_t *pgd) | 90 | |
91 | static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) | ||
92 | { | ||
93 | BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); | ||
94 | virt_to_page(pgd)->index = (pgoff_t)mm; | ||
95 | } | ||
96 | |||
97 | struct mm_struct *pgd_page_get_mm(struct page *page) | ||
98 | { | ||
99 | return (struct mm_struct *)page->index; | ||
100 | } | ||
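
pgd_set_mm() and pgd_page_get_mm() above stash the owning mm_struct in the otherwise unused page->index field of the pgd's page, with a BUILD_BUG_ON ensuring the field is wide enough for a pointer. Outside the kernel the same pattern -- reusing a spare field as a back-pointer and asserting its width at compile time -- can be sketched as below; the types are illustrative, not the kernel's.

    #include <assert.h>
    #include <stdio.h>

    struct owner { const char *name; };

    /* a record with a spare integer-sized slot that can be repurposed */
    struct record { unsigned long spare; int payload; };

    static void record_set_owner(struct record *r, struct owner *o)
    {
        /* compile-time check that the slot can hold a pointer,
         * analogous to the BUILD_BUG_ON() in pgd_set_mm() */
        static_assert(sizeof(r->spare) >= sizeof(o), "spare slot too narrow");
        r->spare = (unsigned long)o;
    }

    static struct owner *record_get_owner(const struct record *r)
    {
        return (struct owner *)r->spare;
    }

    int main(void)
    {
        struct owner mm = { "mm #1" };
        struct record rec = { 0, 42 };

        record_set_owner(&rec, &mm);
        printf("record owned by %s\n", record_get_owner(&rec)->name);
        return 0;
    }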
101 | |||
102 | static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) | ||
91 | { | 103 | { |
92 | /* If the pgd points to a shared pagetable level (either the | 104 | /* If the pgd points to a shared pagetable level (either the |
93 | ptes in non-PAE, or shared PMD in PAE), then just copy the | 105 | ptes in non-PAE, or shared PMD in PAE), then just copy the |
@@ -98,27 +110,23 @@ static void pgd_ctor(pgd_t *pgd) | |||
98 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, | 110 | clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, |
99 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | 111 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, |
100 | KERNEL_PGD_PTRS); | 112 | KERNEL_PGD_PTRS); |
101 | paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, | ||
102 | __pa(swapper_pg_dir) >> PAGE_SHIFT, | ||
103 | KERNEL_PGD_BOUNDARY, | ||
104 | KERNEL_PGD_PTRS); | ||
105 | } | 113 | } |
106 | 114 | ||
107 | /* list required to sync kernel mapping updates */ | 115 | /* list required to sync kernel mapping updates */ |
108 | if (!SHARED_KERNEL_PMD) | 116 | if (!SHARED_KERNEL_PMD) { |
117 | pgd_set_mm(pgd, mm); | ||
109 | pgd_list_add(pgd); | 118 | pgd_list_add(pgd); |
119 | } | ||
110 | } | 120 | } |
111 | 121 | ||
112 | static void pgd_dtor(pgd_t *pgd) | 122 | static void pgd_dtor(pgd_t *pgd) |
113 | { | 123 | { |
114 | unsigned long flags; /* can be called from interrupt context */ | ||
115 | |||
116 | if (SHARED_KERNEL_PMD) | 124 | if (SHARED_KERNEL_PMD) |
117 | return; | 125 | return; |
118 | 126 | ||
119 | spin_lock_irqsave(&pgd_lock, flags); | 127 | spin_lock(&pgd_lock); |
120 | pgd_list_del(pgd); | 128 | pgd_list_del(pgd); |
121 | spin_unlock_irqrestore(&pgd_lock, flags); | 129 | spin_unlock(&pgd_lock); |
122 | } | 130 | } |
123 | 131 | ||
124 | /* | 132 | /* |
@@ -160,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) | |||
160 | * section 8.1: in PAE mode we explicitly have to flush the | 168 | * section 8.1: in PAE mode we explicitly have to flush the |
161 | * TLB via cr3 if the top-level pgd is changed... | 169 | * TLB via cr3 if the top-level pgd is changed... |
162 | */ | 170 | */ |
163 | if (mm == current->active_mm) | 171 | flush_tlb_mm(mm); |
164 | write_cr3(read_cr3()); | ||
165 | } | 172 | } |
166 | #else /* !CONFIG_X86_PAE */ | 173 | #else /* !CONFIG_X86_PAE */ |
167 | 174 | ||
@@ -250,7 +257,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
250 | { | 257 | { |
251 | pgd_t *pgd; | 258 | pgd_t *pgd; |
252 | pmd_t *pmds[PREALLOCATED_PMDS]; | 259 | pmd_t *pmds[PREALLOCATED_PMDS]; |
253 | unsigned long flags; | ||
254 | 260 | ||
255 | pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); | 261 | pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); |
256 | 262 | ||
@@ -270,12 +276,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) | |||
270 | * respect to anything walking the pgd_list, so that they | 276 | * respect to anything walking the pgd_list, so that they |
271 | * never see a partially populated pgd. | 277 | * never see a partially populated pgd. |
272 | */ | 278 | */ |
273 | spin_lock_irqsave(&pgd_lock, flags); | 279 | spin_lock(&pgd_lock); |
274 | 280 | ||
275 | pgd_ctor(pgd); | 281 | pgd_ctor(mm, pgd); |
276 | pgd_prepopulate_pmd(mm, pgd, pmds); | 282 | pgd_prepopulate_pmd(mm, pgd, pmds); |
277 | 283 | ||
278 | spin_unlock_irqrestore(&pgd_lock, flags); | 284 | spin_unlock(&pgd_lock); |
279 | 285 | ||
280 | return pgd; | 286 | return pgd; |
281 | 287 | ||
@@ -310,6 +316,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
310 | return changed; | 316 | return changed; |
311 | } | 317 | } |
312 | 318 | ||
319 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
320 | int pmdp_set_access_flags(struct vm_area_struct *vma, | ||
321 | unsigned long address, pmd_t *pmdp, | ||
322 | pmd_t entry, int dirty) | ||
323 | { | ||
324 | int changed = !pmd_same(*pmdp, entry); | ||
325 | |||
326 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
327 | |||
328 | if (changed && dirty) { | ||
329 | *pmdp = entry; | ||
330 | pmd_update_defer(vma->vm_mm, address, pmdp); | ||
331 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
332 | } | ||
333 | |||
334 | return changed; | ||
335 | } | ||
336 | #endif | ||
337 | |||
313 | int ptep_test_and_clear_young(struct vm_area_struct *vma, | 338 | int ptep_test_and_clear_young(struct vm_area_struct *vma, |
314 | unsigned long addr, pte_t *ptep) | 339 | unsigned long addr, pte_t *ptep) |
315 | { | 340 | { |
@@ -325,6 +350,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma, | |||
325 | return ret; | 350 | return ret; |
326 | } | 351 | } |
327 | 352 | ||
353 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
354 | int pmdp_test_and_clear_young(struct vm_area_struct *vma, | ||
355 | unsigned long addr, pmd_t *pmdp) | ||
356 | { | ||
357 | int ret = 0; | ||
358 | |||
359 | if (pmd_young(*pmdp)) | ||
360 | ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, | ||
361 | (unsigned long *)pmdp); | ||
362 | |||
363 | if (ret) | ||
364 | pmd_update(vma->vm_mm, addr, pmdp); | ||
365 | |||
366 | return ret; | ||
367 | } | ||
368 | #endif | ||
369 | |||
328 | int ptep_clear_flush_young(struct vm_area_struct *vma, | 370 | int ptep_clear_flush_young(struct vm_area_struct *vma, |
329 | unsigned long address, pte_t *ptep) | 371 | unsigned long address, pte_t *ptep) |
330 | { | 372 | { |
@@ -337,6 +379,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma, | |||
337 | return young; | 379 | return young; |
338 | } | 380 | } |
339 | 381 | ||
382 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
383 | int pmdp_clear_flush_young(struct vm_area_struct *vma, | ||
384 | unsigned long address, pmd_t *pmdp) | ||
385 | { | ||
386 | int young; | ||
387 | |||
388 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
389 | |||
390 | young = pmdp_test_and_clear_young(vma, address, pmdp); | ||
391 | if (young) | ||
392 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
393 | |||
394 | return young; | ||
395 | } | ||
396 | |||
397 | void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
398 | unsigned long address, pmd_t *pmdp) | ||
399 | { | ||
400 | int set; | ||
401 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
402 | set = !test_and_set_bit(_PAGE_BIT_SPLITTING, | ||
403 | (unsigned long *)pmdp); | ||
404 | if (set) { | ||
405 | pmd_update(vma->vm_mm, address, pmdp); | ||
406 | /* need tlb flush only to serialize against gup-fast */ | ||
407 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
408 | } | ||
409 | } | ||
410 | #endif | ||
411 | |||
340 | /** | 412 | /** |
341 | * reserve_top_address - reserves a hole in the top of kernel address space | 413 | * reserve_top_address - reserves a hole in the top of kernel address space |
342 | * @reserve - size of hole to reserve | 414 | * @reserve - size of hole to reserve |
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa34086..410531d3c292 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c | |||
@@ -41,7 +41,7 @@ void __init x86_report_nx(void) | |||
41 | { | 41 | { |
42 | if (!cpu_has_nx) { | 42 | if (!cpu_has_nx) { |
43 | printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " | 43 | printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " |
44 | "missing in CPU or disabled in BIOS!\n"); | 44 | "missing in CPU!\n"); |
45 | } else { | 45 | } else { |
46 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 46 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
47 | if (disable_nx) { | 47 | if (disable_nx) { |
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 000000000000..81dbfdeb080d --- /dev/null +++ b/arch/x86/mm/srat.c | |||
@@ -0,0 +1,184 @@ | |||
1 | /* | ||
2 | * ACPI 3.0 based NUMA setup | ||
3 | * Copyright 2004 Andi Kleen, SuSE Labs. | ||
4 | * | ||
5 | * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. | ||
6 | * | ||
7 | * Called from acpi_numa_init while reading the SRAT and SLIT tables. | ||
8 | * Assumes all memory regions belonging to a single proximity domain | ||
9 | * are in one chunk. Holes between them will be included in the node. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/mmzone.h> | ||
15 | #include <linux/bitmap.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/topology.h> | ||
18 | #include <linux/bootmem.h> | ||
19 | #include <linux/memblock.h> | ||
20 | #include <linux/mm.h> | ||
21 | #include <asm/proto.h> | ||
22 | #include <asm/numa.h> | ||
23 | #include <asm/e820.h> | ||
24 | #include <asm/apic.h> | ||
25 | #include <asm/uv/uv.h> | ||
26 | |||
27 | int acpi_numa __initdata; | ||
28 | |||
29 | static __init int setup_node(int pxm) | ||
30 | { | ||
31 | return acpi_map_pxm_to_node(pxm); | ||
32 | } | ||
33 | |||
34 | static __init void bad_srat(void) | ||
35 | { | ||
36 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
37 | acpi_numa = -1; | ||
38 | } | ||
39 | |||
40 | static __init inline int srat_disabled(void) | ||
41 | { | ||
42 | return acpi_numa < 0; | ||
43 | } | ||
44 | |||
45 | /* Callback for SLIT parsing */ | ||
46 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
47 | { | ||
48 | int i, j; | ||
49 | |||
50 | for (i = 0; i < slit->locality_count; i++) | ||
51 | for (j = 0; j < slit->locality_count; j++) | ||
52 | numa_set_distance(pxm_to_node(i), pxm_to_node(j), | ||
53 | slit->entry[slit->locality_count * i + j]); | ||
54 | } | ||
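
acpi_numa_slit_init() above simply copies the SLIT locality matrix into the kernel's distance table, translating proximity-domain indices to node IDs as it goes. The SLIT is a flattened square matrix: entry[locality_count * i + j] is the relative cost for domain i to reach domain j's memory, with 10 meaning local. A minimal reading of such a matrix (values invented for a two-socket example):

    #include <stdio.h>

    int main(void)
    {
        /* flattened 2x2 SLIT as firmware might hand it over */
        const unsigned char entry[] = { 10, 21, 21, 10 };
        const int locality_count = 2;
        int i, j;

        for (i = 0; i < locality_count; i++)
            for (j = 0; j < locality_count; j++)
                printf("distance(%d,%d) = %d\n", i, j,
                       entry[locality_count * i + j]);
        return 0;
    }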
55 | |||
56 | /* Callback for Proximity Domain -> x2APIC mapping */ | ||
57 | void __init | ||
58 | acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) | ||
59 | { | ||
60 | int pxm, node; | ||
61 | int apic_id; | ||
62 | |||
63 | if (srat_disabled()) | ||
64 | return; | ||
65 | if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { | ||
66 | bad_srat(); | ||
67 | return; | ||
68 | } | ||
69 | if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
70 | return; | ||
71 | pxm = pa->proximity_domain; | ||
72 | node = setup_node(pxm); | ||
73 | if (node < 0) { | ||
74 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
75 | bad_srat(); | ||
76 | return; | ||
77 | } | ||
78 | |||
79 | apic_id = pa->apic_id; | ||
80 | if (apic_id >= MAX_LOCAL_APIC) { | ||
81 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); | ||
82 | return; | ||
83 | } | ||
84 | set_apicid_to_node(apic_id, node); | ||
85 | node_set(node, numa_nodes_parsed); | ||
86 | acpi_numa = 1; | ||
87 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", | ||
88 | pxm, apic_id, node); | ||
89 | } | ||
90 | |||
91 | /* Callback for Proximity Domain -> LAPIC mapping */ | ||
92 | void __init | ||
93 | acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | ||
94 | { | ||
95 | int pxm, node; | ||
96 | int apic_id; | ||
97 | |||
98 | if (srat_disabled()) | ||
99 | return; | ||
100 | if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { | ||
101 | bad_srat(); | ||
102 | return; | ||
103 | } | ||
104 | if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
105 | return; | ||
106 | pxm = pa->proximity_domain_lo; | ||
107 | node = setup_node(pxm); | ||
108 | if (node < 0) { | ||
109 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
110 | bad_srat(); | ||
111 | return; | ||
112 | } | ||
113 | |||
114 | if (get_uv_system_type() >= UV_X2APIC) | ||
115 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; | ||
116 | else | ||
117 | apic_id = pa->apic_id; | ||
118 | |||
119 | if (apic_id >= MAX_LOCAL_APIC) { | ||
120 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); | ||
121 | return; | ||
122 | } | ||
123 | |||
124 | set_apicid_to_node(apic_id, node); | ||
125 | node_set(node, numa_nodes_parsed); | ||
126 | acpi_numa = 1; | ||
127 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", | ||
128 | pxm, apic_id, node); | ||
129 | } | ||
130 | |||
131 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
132 | static inline int save_add_info(void) {return 1;} | ||
133 | #else | ||
134 | static inline int save_add_info(void) {return 0;} | ||
135 | #endif | ||
136 | |||
137 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | ||
138 | void __init | ||
139 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | ||
140 | { | ||
141 | u64 start, end; | ||
142 | int node, pxm; | ||
143 | |||
144 | if (srat_disabled()) | ||
145 | return; | ||
146 | if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { | ||
147 | bad_srat(); | ||
148 | return; | ||
149 | } | ||
150 | if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
151 | return; | ||
152 | |||
153 | if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) | ||
154 | return; | ||
155 | start = ma->base_address; | ||
156 | end = start + ma->length; | ||
157 | pxm = ma->proximity_domain; | ||
158 | node = setup_node(pxm); | ||
159 | if (node < 0) { | ||
160 | printk(KERN_ERR "SRAT: Too many proximity domains.\n"); | ||
161 | bad_srat(); | ||
162 | return; | ||
163 | } | ||
164 | |||
165 | if (numa_add_memblk(node, start, end) < 0) { | ||
166 | bad_srat(); | ||
167 | return; | ||
168 | } | ||
169 | |||
170 | printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, | ||
171 | start, end); | ||
172 | } | ||
173 | |||
174 | void __init acpi_numa_arch_fixup(void) {} | ||
175 | |||
176 | int __init x86_acpi_numa_init(void) | ||
177 | { | ||
178 | int ret; | ||
179 | |||
180 | ret = acpi_numa_init(); | ||
181 | if (ret < 0) | ||
182 | return ret; | ||
183 | return srat_disabled() ? -EINVAL : 0; | ||
184 | } | ||
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index 9324f13492d5..000000000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null | |||
@@ -1,285 +0,0 @@ | |||
1 | /* | ||
2 | * Some of the code in this file has been gleaned from the 64 bit | ||
3 | * discontigmem support code base. | ||
4 | * | ||
5 | * Copyright (C) 2002, IBM Corp. | ||
6 | * | ||
7 | * All rights reserved. | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, but | ||
15 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or | ||
17 | * NON INFRINGEMENT. See the GNU General Public License for more | ||
18 | * details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public License | ||
21 | * along with this program; if not, write to the Free Software | ||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
23 | * | ||
24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> | ||
25 | */ | ||
26 | #include <linux/mm.h> | ||
27 | #include <linux/bootmem.h> | ||
28 | #include <linux/mmzone.h> | ||
29 | #include <linux/acpi.h> | ||
30 | #include <linux/nodemask.h> | ||
31 | #include <asm/srat.h> | ||
32 | #include <asm/topology.h> | ||
33 | #include <asm/smp.h> | ||
34 | #include <asm/e820.h> | ||
35 | |||
36 | /* | ||
37 | * proximity macros and definitions | ||
38 | */ | ||
39 | #define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ | ||
40 | #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ | ||
41 | #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) | ||
42 | #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) | ||
43 | /* bitmap length; _PXM is at most 255 */ | ||
44 | #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) | ||
45 | static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ | ||
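
A small standalone rendering of the bitmap helpers above, assuming 256 possible proximity domains (the LAPIC affinity entry carries an 8-bit _PXM, per the comment); the macros mirror the ones in this deleted file.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Mirror of the pxm_bitmap helpers: one bit per proximity domain,
 * packed eight to a byte. */
#define NODE_ARRAY_INDEX(x)     ((x) / 8)
#define NODE_ARRAY_OFFSET(x)    ((x) % 8)
#define BMAP_SET(bmap, bit)     ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)    ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
        uint8_t pxm_bitmap[256 / 8];
        memset(pxm_bitmap, 0, sizeof(pxm_bitmap));

        BMAP_SET(pxm_bitmap, 0);        /* PXM 0 seen in a CPU entry     */
        BMAP_SET(pxm_bitmap, 17);       /* PXM 17 seen in a memory entry */

        for (int pxm = 0; pxm < 256; pxm++)
                if (BMAP_TEST(pxm_bitmap, pxm))
                        printf("proximity domain %d present\n", pxm);
        return 0;
}
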
46 | |||
47 | #define MAX_CHUNKS_PER_NODE 3 | ||
48 | #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) | ||
49 | struct node_memory_chunk_s { | ||
50 | unsigned long start_pfn; | ||
51 | unsigned long end_pfn; | ||
52 | u8 pxm; // proximity domain of node | ||
53 | u8 nid; // which cnode contains this chunk? | ||
54 | u8 bank; // which mem bank on this node | ||
55 | }; | ||
56 | static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; | ||
57 | |||
58 | static int __initdata num_memory_chunks; /* total number of memory chunks */ | ||
59 | static u8 __initdata apicid_to_pxm[MAX_APICID]; | ||
60 | |||
61 | int numa_off __initdata; | ||
62 | int acpi_numa __initdata; | ||
63 | |||
64 | static __init void bad_srat(void) | ||
65 | { | ||
66 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
67 | acpi_numa = -1; | ||
68 | num_memory_chunks = 0; | ||
69 | } | ||
70 | |||
71 | static __init inline int srat_disabled(void) | ||
72 | { | ||
73 | return numa_off || acpi_numa < 0; | ||
74 | } | ||
75 | |||
76 | /* Identify CPU proximity domains */ | ||
77 | void __init | ||
78 | acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) | ||
79 | { | ||
80 | if (srat_disabled()) | ||
81 | return; | ||
82 | if (cpu_affinity->header.length != | ||
83 | sizeof(struct acpi_srat_cpu_affinity)) { | ||
84 | bad_srat(); | ||
85 | return; | ||
86 | } | ||
87 | |||
88 | if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
89 | return; /* empty entry */ | ||
90 | |||
91 | /* mark this node as "seen" in node bitmap */ | ||
92 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); | ||
93 | |||
94 | apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; | ||
95 | |||
96 | printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", | ||
97 | cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * Identify memory proximity domains and hot-remove capabilities. | ||
102 | * Fill node memory chunk list structure. | ||
103 | */ | ||
104 | void __init | ||
105 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) | ||
106 | { | ||
107 | unsigned long long paddr, size; | ||
108 | unsigned long start_pfn, end_pfn; | ||
109 | u8 pxm; | ||
110 | struct node_memory_chunk_s *p, *q, *pend; | ||
111 | |||
112 | if (srat_disabled()) | ||
113 | return; | ||
114 | if (memory_affinity->header.length != | ||
115 | sizeof(struct acpi_srat_mem_affinity)) { | ||
116 | bad_srat(); | ||
117 | return; | ||
118 | } | ||
119 | |||
120 | if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
121 | return; /* empty entry */ | ||
122 | |||
123 | pxm = memory_affinity->proximity_domain & 0xff; | ||
124 | |||
125 | /* mark this node as "seen" in node bitmap */ | ||
126 | BMAP_SET(pxm_bitmap, pxm); | ||
127 | |||
128 | /* calculate info for memory chunk structure */ | ||
129 | paddr = memory_affinity->base_address; | ||
130 | size = memory_affinity->length; | ||
131 | |||
132 | start_pfn = paddr >> PAGE_SHIFT; | ||
133 | end_pfn = (paddr + size) >> PAGE_SHIFT; | ||
134 | |||
135 | |||
136 | if (num_memory_chunks >= MAXCHUNKS) { | ||
137 | printk(KERN_WARNING "Too many mem chunks in SRAT." | ||
138 | " Ignoring %lld MBytes at %llx\n", | ||
139 | size/(1024*1024), paddr); | ||
140 | return; | ||
141 | } | ||
142 | |||
143 | /* Insertion sort based on base address */ | ||
144 | pend = &node_memory_chunk[num_memory_chunks]; | ||
145 | for (p = &node_memory_chunk[0]; p < pend; p++) { | ||
146 | if (start_pfn < p->start_pfn) | ||
147 | break; | ||
148 | } | ||
149 | if (p < pend) { | ||
150 | for (q = pend; q >= p; q--) | ||
151 | *(q + 1) = *q; | ||
152 | } | ||
153 | p->start_pfn = start_pfn; | ||
154 | p->end_pfn = end_pfn; | ||
155 | p->pxm = pxm; | ||
156 | |||
157 | num_memory_chunks++; | ||
158 | |||
159 | printk(KERN_DEBUG "Memory range %08lx to %08lx" | ||
160 | " in proximity domain %02x %s\n", | ||
161 | start_pfn, end_pfn, | ||
162 | pxm, | ||
163 | ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? | ||
164 | "enabled and removable" : "enabled" ) ); | ||
165 | } | ||
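
In sketch form, the insertion step above: each parsed memory range is slotted into a fixed array so that the array stays sorted by start_pfn. The chunk type is trimmed to the two fields the sort uses, and MAXCHUNKS is an arbitrary small value here.

#include <stdio.h>

#define MAXCHUNKS 16

struct chunk {
        unsigned long start_pfn;
        unsigned long end_pfn;
};

static struct chunk chunks[MAXCHUNKS];
static int num_chunks;

/* Insert a chunk so the array stays sorted by start_pfn, as the SRAT
 * parser above does for node_memory_chunk[]. */
static int add_chunk(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

        if (num_chunks >= MAXCHUNKS)
                return -1;

        /* find the first entry that starts after the new one */
        for (i = 0; i < num_chunks; i++)
                if (start_pfn < chunks[i].start_pfn)
                        break;

        /* shift the tail up by one slot and drop the new chunk in */
        for (int j = num_chunks; j > i; j--)
                chunks[j] = chunks[j - 1];
        chunks[i].start_pfn = start_pfn;
        chunks[i].end_pfn = end_pfn;
        num_chunks++;
        return 0;
}

int main(void)
{
        add_chunk(0x40000, 0x80000);
        add_chunk(0x00000, 0x40000);    /* lands in front of the first one */
        add_chunk(0x80000, 0xc0000);

        for (int i = 0; i < num_chunks; i++)
                printf("chunk %d: %08lx - %08lx\n",
                       i, chunks[i].start_pfn, chunks[i].end_pfn);
        return 0;
}
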
166 | |||
167 | /* Callback for SLIT parsing */ | ||
168 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
169 | { | ||
170 | } | ||
171 | |||
172 | void acpi_numa_arch_fixup(void) | ||
173 | { | ||
174 | } | ||
175 | /* | ||
176 | * The SRAT table always lists ascending addresses, so can always | ||
177 | * assume that the first "start" address that you see is the real | ||
178 | * start of the node, and that the current "end" address is after | ||
179 | * the previous one. | ||
180 | */ | ||
181 | static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) | ||
182 | { | ||
183 | /* | ||
184 | * Only add present memory as told by the e820. | ||
185 | * There is no guarantee from the SRAT that the memory it | ||
186 | * enumerates is present at boot time because it represents | ||
187 | * *possible* memory hotplug areas the same as normal RAM. | ||
188 | */ | ||
189 | if (memory_chunk->start_pfn >= max_pfn) { | ||
190 | printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", | ||
191 | memory_chunk->start_pfn, memory_chunk->end_pfn); | ||
192 | return -1; | ||
193 | } | ||
194 | if (memory_chunk->nid != nid) | ||
195 | return -1; | ||
196 | |||
197 | if (!node_has_online_mem(nid)) | ||
198 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
199 | |||
200 | if (node_start_pfn[nid] > memory_chunk->start_pfn) | ||
201 | node_start_pfn[nid] = memory_chunk->start_pfn; | ||
202 | |||
203 | if (node_end_pfn[nid] < memory_chunk->end_pfn) | ||
204 | node_end_pfn[nid] = memory_chunk->end_pfn; | ||
205 | |||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | int __init get_memcfg_from_srat(void) | ||
210 | { | ||
211 | int i, j, nid; | ||
212 | |||
213 | |||
214 | if (srat_disabled()) | ||
215 | goto out_fail; | ||
216 | |||
217 | if (num_memory_chunks == 0) { | ||
218 | printk(KERN_DEBUG | ||
219 | "could not find any ACPI SRAT memory areas.\n"); | ||
220 | goto out_fail; | ||
221 | } | ||
222 | |||
223 | /* Calculate total number of nodes in system from PXM bitmap and create | ||
224 | * a set of sequential node IDs starting at zero. (ACPI doesn't seem | ||
225 | * to specify the range of _PXM values.) | ||
226 | */ | ||
227 | /* | ||
228 | * MCD - we no longer HAVE to number nodes sequentially. PXM domain | ||
229 | * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically | ||
230 | * 32, so we will continue numbering them in this manner until MAX_NUMNODES | ||
231 | * approaches MAX_PXM_DOMAINS for i386. | ||
232 | */ | ||
233 | nodes_clear(node_online_map); | ||
234 | for (i = 0; i < MAX_PXM_DOMAINS; i++) { | ||
235 | if (BMAP_TEST(pxm_bitmap, i)) { | ||
236 | int nid = acpi_map_pxm_to_node(i); | ||
237 | node_set_online(nid); | ||
238 | } | ||
239 | } | ||
240 | BUG_ON(num_online_nodes() == 0); | ||
241 | |||
242 | /* set cnode id in memory chunk structure */ | ||
243 | for (i = 0; i < num_memory_chunks; i++) | ||
244 | node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); | ||
245 | |||
246 | printk(KERN_DEBUG "pxm bitmap: "); | ||
247 | for (i = 0; i < sizeof(pxm_bitmap); i++) { | ||
248 | printk(KERN_CONT "%02x ", pxm_bitmap[i]); | ||
249 | } | ||
250 | printk(KERN_CONT "\n"); | ||
251 | printk(KERN_DEBUG "Number of logical nodes in system = %d\n", | ||
252 | num_online_nodes()); | ||
253 | printk(KERN_DEBUG "Number of memory chunks in system = %d\n", | ||
254 | num_memory_chunks); | ||
255 | |||
256 | for (i = 0; i < MAX_APICID; i++) | ||
257 | apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]); | ||
258 | |||
259 | for (j = 0; j < num_memory_chunks; j++){ | ||
260 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; | ||
261 | printk(KERN_DEBUG | ||
262 | "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", | ||
263 | j, chunk->nid, chunk->start_pfn, chunk->end_pfn); | ||
264 | if (node_read_chunk(chunk->nid, chunk)) | ||
265 | continue; | ||
266 | |||
267 | e820_register_active_regions(chunk->nid, chunk->start_pfn, | ||
268 | min(chunk->end_pfn, max_pfn)); | ||
269 | } | ||
270 | /* for out of order entries in SRAT */ | ||
271 | sort_node_map(); | ||
272 | |||
273 | for_each_online_node(nid) { | ||
274 | unsigned long start = node_start_pfn[nid]; | ||
275 | unsigned long end = min(node_end_pfn[nid], max_pfn); | ||
276 | |||
277 | memory_present(nid, start, end); | ||
278 | node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); | ||
279 | } | ||
280 | return 1; | ||
281 | out_fail: | ||
282 | printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" | ||
283 | " table\n"); | ||
284 | return 0; | ||
285 | } | ||
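
acpi_map_pxm_to_node() itself is not part of this diff; the sketch below only illustrates the idea the loop over MAX_PXM_DOMAINS relies on, namely compressing sparse _PXM values into a dense set of sequential logical node IDs. map_pxm_to_node() is a hypothetical stand-in, not the kernel implementation.

#include <stdio.h>

#define MAX_PXM_DOMAINS 256
#define MAX_NUMNODES    32

/* Hypothetical stand-in for acpi_map_pxm_to_node(): hand out node IDs
 * 0, 1, 2, ... in the order proximity domains are first seen. */
static int pxm_to_nid[MAX_PXM_DOMAINS];
static int next_nid;

static int map_pxm_to_node(int pxm)
{
        if (pxm < 0 || pxm >= MAX_PXM_DOMAINS)
                return -1;
        if (pxm_to_nid[pxm] < 0) {
                if (next_nid >= MAX_NUMNODES)
                        return -1;              /* too many domains */
                pxm_to_nid[pxm] = next_nid++;
        }
        return pxm_to_nid[pxm];
}

int main(void)
{
        int seen_pxms[] = { 4, 0, 4, 129 };     /* sparse, out of order */

        for (int i = 0; i < MAX_PXM_DOMAINS; i++)
                pxm_to_nid[i] = -1;

        for (unsigned i = 0; i < sizeof(seen_pxms) / sizeof(seen_pxms[0]); i++)
                printf("PXM %3d -> node %d\n",
                       seen_pxms[i], map_pxm_to_node(seen_pxms[i]));
        return 0;
}
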
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c deleted file mode 100644 index 9c0d0d399c30..000000000000 --- a/arch/x86/mm/srat_64.c +++ /dev/null | |||
@@ -1,564 +0,0 @@ | |||
1 | /* | ||
2 | * ACPI 3.0 based NUMA setup | ||
3 | * Copyright 2004 Andi Kleen, SuSE Labs. | ||
4 | * | ||
5 | * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. | ||
6 | * | ||
7 | * Called from acpi_numa_init while reading the SRAT and SLIT tables. | ||
8 | * Assumes all memory regions belonging to a single proximity domain | ||
9 | * are in one chunk. Holes between them will be included in the node. | ||
10 | */ | ||
11 | |||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/acpi.h> | ||
14 | #include <linux/mmzone.h> | ||
15 | #include <linux/bitmap.h> | ||
16 | #include <linux/module.h> | ||
17 | #include <linux/topology.h> | ||
18 | #include <linux/bootmem.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <asm/proto.h> | ||
21 | #include <asm/numa.h> | ||
22 | #include <asm/e820.h> | ||
23 | #include <asm/apic.h> | ||
24 | #include <asm/uv/uv.h> | ||
25 | |||
26 | int acpi_numa __initdata; | ||
27 | |||
28 | static struct acpi_table_slit *acpi_slit; | ||
29 | |||
30 | static nodemask_t nodes_parsed __initdata; | ||
31 | static nodemask_t cpu_nodes_parsed __initdata; | ||
32 | static struct bootnode nodes[MAX_NUMNODES] __initdata; | ||
33 | static struct bootnode nodes_add[MAX_NUMNODES]; | ||
34 | |||
35 | static int num_node_memblks __initdata; | ||
36 | static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; | ||
37 | static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; | ||
38 | |||
39 | static __init int setup_node(int pxm) | ||
40 | { | ||
41 | return acpi_map_pxm_to_node(pxm); | ||
42 | } | ||
43 | |||
44 | static __init int conflicting_memblks(unsigned long start, unsigned long end) | ||
45 | { | ||
46 | int i; | ||
47 | for (i = 0; i < num_node_memblks; i++) { | ||
48 | struct bootnode *nd = &node_memblk_range[i]; | ||
49 | if (nd->start == nd->end) | ||
50 | continue; | ||
51 | if (nd->end > start && nd->start < end) | ||
52 | return memblk_nodeid[i]; | ||
53 | if (nd->end == end && nd->start == start) | ||
54 | return memblk_nodeid[i]; | ||
55 | } | ||
56 | return -1; | ||
57 | } | ||
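
The test in conflicting_memblks() is the standard half-open interval overlap check; a tiny sketch with made-up ranges:

#include <stdio.h>

struct range { unsigned long start, end; };     /* [start, end) */

/* Two ranges overlap iff each one ends after the other begins --
 * the same test conflicting_memblks() applies to node_memblk_range[]. */
static int ranges_overlap(struct range a, struct range b)
{
        return a.end > b.start && a.start < b.end;
}

int main(void)
{
        struct range a = { 0x000, 0x100 };
        struct range b = { 0x100, 0x200 };      /* adjacent, not overlapping */
        struct range c = { 0x0c0, 0x140 };      /* straddles a and b         */

        printf("a/b: %d\n", ranges_overlap(a, b));      /* 0 */
        printf("a/c: %d\n", ranges_overlap(a, c));      /* 1 */
        printf("b/c: %d\n", ranges_overlap(b, c));      /* 1 */
        return 0;
}
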
58 | |||
59 | static __init void cutoff_node(int i, unsigned long start, unsigned long end) | ||
60 | { | ||
61 | struct bootnode *nd = &nodes[i]; | ||
62 | |||
63 | if (nd->start < start) { | ||
64 | nd->start = start; | ||
65 | if (nd->end < nd->start) | ||
66 | nd->start = nd->end; | ||
67 | } | ||
68 | if (nd->end > end) { | ||
69 | nd->end = end; | ||
70 | if (nd->start > nd->end) | ||
71 | nd->start = nd->end; | ||
72 | } | ||
73 | } | ||
74 | |||
75 | static __init void bad_srat(void) | ||
76 | { | ||
77 | int i; | ||
78 | printk(KERN_ERR "SRAT: SRAT not used.\n"); | ||
79 | acpi_numa = -1; | ||
80 | for (i = 0; i < MAX_LOCAL_APIC; i++) | ||
81 | apicid_to_node[i] = NUMA_NO_NODE; | ||
82 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
83 | nodes[i].start = nodes[i].end = 0; | ||
84 | nodes_add[i].start = nodes_add[i].end = 0; | ||
85 | } | ||
86 | remove_all_active_ranges(); | ||
87 | } | ||
88 | |||
89 | static __init inline int srat_disabled(void) | ||
90 | { | ||
91 | return numa_off || acpi_numa < 0; | ||
92 | } | ||
93 | |||
94 | /* Callback for SLIT parsing */ | ||
95 | void __init acpi_numa_slit_init(struct acpi_table_slit *slit) | ||
96 | { | ||
97 | unsigned length; | ||
98 | unsigned long phys; | ||
99 | |||
100 | length = slit->header.length; | ||
101 | phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length, | ||
102 | PAGE_SIZE); | ||
103 | |||
104 | if (phys == -1L) | ||
105 | panic(" Can not save slit!\n"); | ||
106 | |||
107 | acpi_slit = __va(phys); | ||
108 | memcpy(acpi_slit, slit, length); | ||
109 | reserve_early(phys, phys + length, "ACPI SLIT"); | ||
110 | } | ||
111 | |||
112 | /* Callback for Proximity Domain -> x2APIC mapping */ | ||
113 | void __init | ||
114 | acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) | ||
115 | { | ||
116 | int pxm, node; | ||
117 | int apic_id; | ||
118 | |||
119 | if (srat_disabled()) | ||
120 | return; | ||
121 | if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { | ||
122 | bad_srat(); | ||
123 | return; | ||
124 | } | ||
125 | if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
126 | return; | ||
127 | pxm = pa->proximity_domain; | ||
128 | node = setup_node(pxm); | ||
129 | if (node < 0) { | ||
130 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
131 | bad_srat(); | ||
132 | return; | ||
133 | } | ||
134 | |||
135 | apic_id = pa->apic_id; | ||
136 | apicid_to_node[apic_id] = node; | ||
137 | node_set(node, cpu_nodes_parsed); | ||
138 | acpi_numa = 1; | ||
139 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", | ||
140 | pxm, apic_id, node); | ||
141 | } | ||
142 | |||
143 | /* Callback for Proximity Domain -> LAPIC mapping */ | ||
144 | void __init | ||
145 | acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) | ||
146 | { | ||
147 | int pxm, node; | ||
148 | int apic_id; | ||
149 | |||
150 | if (srat_disabled()) | ||
151 | return; | ||
152 | if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { | ||
153 | bad_srat(); | ||
154 | return; | ||
155 | } | ||
156 | if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) | ||
157 | return; | ||
158 | pxm = pa->proximity_domain_lo; | ||
159 | node = setup_node(pxm); | ||
160 | if (node < 0) { | ||
161 | printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); | ||
162 | bad_srat(); | ||
163 | return; | ||
164 | } | ||
165 | |||
166 | if (get_uv_system_type() >= UV_X2APIC) | ||
167 | apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; | ||
168 | else | ||
169 | apic_id = pa->apic_id; | ||
170 | apicid_to_node[apic_id] = node; | ||
171 | node_set(node, cpu_nodes_parsed); | ||
172 | acpi_numa = 1; | ||
173 | printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", | ||
174 | pxm, apic_id, node); | ||
175 | } | ||
176 | |||
177 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | ||
178 | static inline int save_add_info(void) {return 1;} | ||
179 | #else | ||
180 | static inline int save_add_info(void) {return 0;} | ||
181 | #endif | ||
182 | /* | ||
183 | * Update nodes_add[] | ||
184 | * This code supports one contiguous hot add area per node | ||
185 | */ | ||
186 | static void __init | ||
187 | update_nodes_add(int node, unsigned long start, unsigned long end) | ||
188 | { | ||
189 | unsigned long s_pfn = start >> PAGE_SHIFT; | ||
190 | unsigned long e_pfn = end >> PAGE_SHIFT; | ||
191 | int changed = 0; | ||
192 | struct bootnode *nd = &nodes_add[node]; | ||
193 | |||
194 | /* I had some trouble with strange memory hotadd regions breaking | ||
195 | the boot. Be very strict here and reject anything unexpected. | ||
196 | If you want working memory hotadd write correct SRATs. | ||
197 | |||
198 | The node size check is a basic sanity check to guard against | ||
199 | mistakes */ | ||
200 | if ((signed long)(end - start) < NODE_MIN_SIZE) { | ||
201 | printk(KERN_ERR "SRAT: Hotplug area too small\n"); | ||
202 | return; | ||
203 | } | ||
204 | |||
205 | /* This check might be a bit too strict, but I'm keeping it for now. */ | ||
206 | if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { | ||
207 | printk(KERN_ERR | ||
208 | "SRAT: Hotplug area %lu -> %lu has existing memory\n", | ||
209 | s_pfn, e_pfn); | ||
210 | return; | ||
211 | } | ||
212 | |||
213 | /* Looks good */ | ||
214 | |||
215 | if (nd->start == nd->end) { | ||
216 | nd->start = start; | ||
217 | nd->end = end; | ||
218 | changed = 1; | ||
219 | } else { | ||
220 | if (nd->start == end) { | ||
221 | nd->start = start; | ||
222 | changed = 1; | ||
223 | } | ||
224 | if (nd->end == start) { | ||
225 | nd->end = end; | ||
226 | changed = 1; | ||
227 | } | ||
228 | if (!changed) | ||
229 | printk(KERN_ERR "SRAT: Hotplug zone not contiguous. Partly ignored\n"); | ||
230 | } | ||
231 | |||
232 | if (changed) { | ||
233 | node_set(node, cpu_nodes_parsed); | ||
234 | printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", | ||
235 | nd->start, nd->end); | ||
236 | } | ||
237 | } | ||
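
The merge rules above allow exactly one contiguous hot-add area per node; a userspace sketch of just that bookkeeping, with the kernel's additional sanity checks (NODE_MIN_SIZE, absent_pages_in_range()) left out:

#include <stdio.h>

struct bootrange { unsigned long start, end; }; /* [start, end), empty if start == end */

/* Grow the per-node hot-add range, but only if the new area touches it;
 * disjoint pieces are rejected, as in update_nodes_add() above. */
static int grow_hotadd_range(struct bootrange *nd,
                             unsigned long start, unsigned long end)
{
        if (nd->start == nd->end) {             /* first area for this node      */
                nd->start = start;
                nd->end = end;
                return 1;
        }
        if (nd->start == end) {                 /* new area ends where we begin  */
                nd->start = start;
                return 1;
        }
        if (nd->end == start) {                 /* new area begins where we end  */
                nd->end = end;
                return 1;
        }
        return 0;                               /* not contiguous: partly ignored */
}

int main(void)
{
        struct bootrange nd = { 0, 0 };

        printf("%d\n", grow_hotadd_range(&nd, 0x1000, 0x2000));        /* 1            */
        printf("%d\n", grow_hotadd_range(&nd, 0x2000, 0x3000));        /* 1, appended  */
        printf("%d\n", grow_hotadd_range(&nd, 0x8000, 0x9000));        /* 0, disjoint  */
        printf("range: %lx-%lx\n", nd.start, nd.end);                  /* 1000-3000    */
        return 0;
}
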
238 | |||
239 | /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ | ||
240 | void __init | ||
241 | acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) | ||
242 | { | ||
243 | struct bootnode *nd, oldnode; | ||
244 | unsigned long start, end; | ||
245 | int node, pxm; | ||
246 | int i; | ||
247 | |||
248 | if (srat_disabled()) | ||
249 | return; | ||
250 | if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { | ||
251 | bad_srat(); | ||
252 | return; | ||
253 | } | ||
254 | if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) | ||
255 | return; | ||
256 | |||
257 | if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) | ||
258 | return; | ||
259 | start = ma->base_address; | ||
260 | end = start + ma->length; | ||
261 | pxm = ma->proximity_domain; | ||
262 | node = setup_node(pxm); | ||
263 | if (node < 0) { | ||
264 | printk(KERN_ERR "SRAT: Too many proximity domains.\n"); | ||
265 | bad_srat(); | ||
266 | return; | ||
267 | } | ||
268 | i = conflicting_memblks(start, end); | ||
269 | if (i == node) { | ||
270 | printk(KERN_WARNING | ||
271 | "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n", | ||
272 | pxm, start, end, nodes[i].start, nodes[i].end); | ||
273 | } else if (i >= 0) { | ||
274 | printk(KERN_ERR | ||
275 | "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n", | ||
276 | pxm, start, end, node_to_pxm(i), | ||
277 | nodes[i].start, nodes[i].end); | ||
278 | bad_srat(); | ||
279 | return; | ||
280 | } | ||
281 | nd = &nodes[node]; | ||
282 | oldnode = *nd; | ||
283 | if (!node_test_and_set(node, nodes_parsed)) { | ||
284 | nd->start = start; | ||
285 | nd->end = end; | ||
286 | } else { | ||
287 | if (start < nd->start) | ||
288 | nd->start = start; | ||
289 | if (nd->end < end) | ||
290 | nd->end = end; | ||
291 | } | ||
292 | |||
293 | printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, | ||
294 | start, end); | ||
295 | |||
296 | if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { | ||
297 | update_nodes_add(node, start, end); | ||
298 | /* restore nodes[node] */ | ||
299 | *nd = oldnode; | ||
300 | if ((nd->start | nd->end) == 0) | ||
301 | node_clear(node, nodes_parsed); | ||
302 | } | ||
303 | |||
304 | node_memblk_range[num_node_memblks].start = start; | ||
305 | node_memblk_range[num_node_memblks].end = end; | ||
306 | memblk_nodeid[num_node_memblks] = node; | ||
307 | num_node_memblks++; | ||
308 | } | ||
309 | |||
310 | /* Sanity check to catch more bad SRATs (they are amazingly common). | ||
311 | Make sure the PXMs cover all memory. */ | ||
312 | static int __init nodes_cover_memory(const struct bootnode *nodes) | ||
313 | { | ||
314 | int i; | ||
315 | unsigned long pxmram, e820ram; | ||
316 | |||
317 | pxmram = 0; | ||
318 | for_each_node_mask(i, nodes_parsed) { | ||
319 | unsigned long s = nodes[i].start >> PAGE_SHIFT; | ||
320 | unsigned long e = nodes[i].end >> PAGE_SHIFT; | ||
321 | pxmram += e - s; | ||
322 | pxmram -= __absent_pages_in_range(i, s, e); | ||
323 | if ((long)pxmram < 0) | ||
324 | pxmram = 0; | ||
325 | } | ||
326 | |||
327 | e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT); | ||
328 | /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ | ||
329 | if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { | ||
330 | printk(KERN_ERR | ||
331 | "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", | ||
332 | (pxmram << PAGE_SHIFT) >> 20, | ||
333 | (e820ram << PAGE_SHIFT) >> 20); | ||
334 | return 0; | ||
335 | } | ||
336 | return 1; | ||
337 | } | ||
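
The acceptance test reduces to a page-count comparison with 1 MB of slack; in sketch form, with made-up page counts standing in for the e820 and absent-page accounting:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Accept the SRAT only if the proximity domains cover (almost) all of
 * the RAM the e820 map reports -- the same "1 MB of slack" rule as
 * nodes_cover_memory() above. */
static int srat_covers_e820(unsigned long pxm_pages, unsigned long e820_pages)
{
        return (long)(e820_pages - pxm_pages) < (1L << (20 - PAGE_SHIFT));
}

int main(void)
{
        unsigned long e820_pages = 4UL << (30 - PAGE_SHIFT);    /* 4 GB of RAM */

        /* SRAT describes all but 3 pages: accepted (the comment above
         * notes a few pages routinely go missing in the accounting). */
        printf("%d\n", srat_covers_e820(e820_pages - 3, e820_pages));

        /* SRAT misses 2 MB: rejected, NUMA falls back to non-SRAT setup. */
        printf("%d\n", srat_covers_e820(e820_pages - (2UL << (20 - PAGE_SHIFT)),
                                        e820_pages));
        return 0;
}
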
338 | |||
339 | void __init acpi_numa_arch_fixup(void) {} | ||
340 | |||
341 | int __init acpi_get_nodes(struct bootnode *physnodes) | ||
342 | { | ||
343 | int i; | ||
344 | int ret = 0; | ||
345 | |||
346 | for_each_node_mask(i, nodes_parsed) { | ||
347 | physnodes[ret].start = nodes[i].start; | ||
348 | physnodes[ret].end = nodes[i].end; | ||
349 | ret++; | ||
350 | } | ||
351 | return ret; | ||
352 | } | ||
353 | |||
354 | /* Use the information discovered above to actually set up the nodes. */ | ||
355 | int __init acpi_scan_nodes(unsigned long start, unsigned long end) | ||
356 | { | ||
357 | int i; | ||
358 | |||
359 | if (acpi_numa <= 0) | ||
360 | return -1; | ||
361 | |||
362 | /* First clean up the node list */ | ||
363 | for (i = 0; i < MAX_NUMNODES; i++) | ||
364 | cutoff_node(i, start, end); | ||
365 | |||
366 | /* | ||
367 | * Join together blocks on the same node, holes between | ||
368 | * which don't overlap with memory on other nodes. | ||
369 | */ | ||
370 | for (i = 0; i < num_node_memblks; ++i) { | ||
371 | int j, k; | ||
372 | |||
373 | for (j = i + 1; j < num_node_memblks; ++j) { | ||
374 | unsigned long start, end; | ||
375 | |||
376 | if (memblk_nodeid[i] != memblk_nodeid[j]) | ||
377 | continue; | ||
378 | start = min(node_memblk_range[i].end, | ||
379 | node_memblk_range[j].end); | ||
380 | end = max(node_memblk_range[i].start, | ||
381 | node_memblk_range[j].start); | ||
382 | for (k = 0; k < num_node_memblks; ++k) { | ||
383 | if (memblk_nodeid[i] == memblk_nodeid[k]) | ||
384 | continue; | ||
385 | if (start < node_memblk_range[k].end && | ||
386 | end > node_memblk_range[k].start) | ||
387 | break; | ||
388 | } | ||
389 | if (k < num_node_memblks) | ||
390 | continue; | ||
391 | start = min(node_memblk_range[i].start, | ||
392 | node_memblk_range[j].start); | ||
393 | end = max(node_memblk_range[i].end, | ||
394 | node_memblk_range[j].end); | ||
395 | printk(KERN_INFO "SRAT: Node %d " | ||
396 | "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", | ||
397 | memblk_nodeid[i], | ||
398 | node_memblk_range[i].start, | ||
399 | node_memblk_range[i].end, | ||
400 | node_memblk_range[j].start, | ||
401 | node_memblk_range[j].end, | ||
402 | start, end); | ||
403 | node_memblk_range[i].start = start; | ||
404 | node_memblk_range[i].end = end; | ||
405 | k = --num_node_memblks - j; | ||
406 | memmove(memblk_nodeid + j, memblk_nodeid + j+1, | ||
407 | k * sizeof(*memblk_nodeid)); | ||
408 | memmove(node_memblk_range + j, node_memblk_range + j+1, | ||
409 | k * sizeof(*node_memblk_range)); | ||
410 | --j; | ||
411 | } | ||
412 | } | ||
413 | |||
414 | memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, | ||
415 | memblk_nodeid); | ||
416 | if (memnode_shift < 0) { | ||
417 | printk(KERN_ERR | ||
418 | "SRAT: No NUMA node hash function found. Contact maintainer\n"); | ||
419 | bad_srat(); | ||
420 | return -1; | ||
421 | } | ||
422 | |||
423 | for (i = 0; i < num_node_memblks; i++) | ||
424 | e820_register_active_regions(memblk_nodeid[i], | ||
425 | node_memblk_range[i].start >> PAGE_SHIFT, | ||
426 | node_memblk_range[i].end >> PAGE_SHIFT); | ||
427 | |||
428 | /* for out of order entries in SRAT */ | ||
429 | sort_node_map(); | ||
430 | if (!nodes_cover_memory(nodes)) { | ||
431 | bad_srat(); | ||
432 | return -1; | ||
433 | } | ||
434 | |||
435 | /* Account for nodes with cpus and no memory */ | ||
436 | nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); | ||
437 | |||
438 | /* Finally register nodes */ | ||
439 | for_each_node_mask(i, node_possible_map) | ||
440 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
441 | /* Try again in case setup_node_bootmem missed one due | ||
442 | to missing bootmem */ | ||
443 | for_each_node_mask(i, node_possible_map) | ||
444 | if (!node_online(i)) | ||
445 | setup_node_bootmem(i, nodes[i].start, nodes[i].end); | ||
446 | |||
447 | for (i = 0; i < nr_cpu_ids; i++) { | ||
448 | int node = early_cpu_to_node(i); | ||
449 | |||
450 | if (node == NUMA_NO_NODE) | ||
451 | continue; | ||
452 | if (!node_online(node)) | ||
453 | numa_clear_node(i); | ||
454 | } | ||
455 | numa_init_array(); | ||
456 | return 0; | ||
457 | } | ||
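
The subtle part of acpi_scan_nodes() is the merge condition: two blocks of the same node may be joined only if the hole between them overlaps no memory belonging to another node. A reduced sketch of just that rule, without the removal and compaction of the merged entry:

#include <stdio.h>

struct blk { unsigned long start, end; int nid; };

static struct blk blks[] = {
        { 0x000, 0x100, 0 },
        { 0x200, 0x300, 0 },    /* same node, hole is [0x100, 0x200) */
        { 0x400, 0x500, 1 },
};
static int nr_blks = 3;

/* May blocks i and j (same node) be joined?  Only if the hole between
 * them overlaps no block of a different node -- the check performed by
 * the merge loop in acpi_scan_nodes(). */
static int can_join(int i, int j)
{
        unsigned long hole_start = blks[i].end < blks[j].end ? blks[i].end : blks[j].end;
        unsigned long hole_end   = blks[i].start > blks[j].start ? blks[i].start : blks[j].start;

        for (int k = 0; k < nr_blks; k++) {
                if (blks[k].nid == blks[i].nid)
                        continue;
                if (hole_start < blks[k].end && hole_end > blks[k].start)
                        return 0;
        }
        return 1;
}

int main(void)
{
        printf("join 0+1: %s\n", can_join(0, 1) ? "yes" : "no");        /* yes */

        /* Pretend node 1 owns part of the hole: now the join is refused. */
        blks[2].start = 0x140;
        blks[2].end   = 0x1c0;
        printf("join 0+1: %s\n", can_join(0, 1) ? "yes" : "no");        /* no  */
        return 0;
}
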
458 | |||
459 | #ifdef CONFIG_NUMA_EMU | ||
460 | static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { | ||
461 | [0 ... MAX_NUMNODES-1] = PXM_INVAL | ||
462 | }; | ||
463 | static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { | ||
464 | [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE | ||
465 | }; | ||
466 | static int __init find_node_by_addr(unsigned long addr) | ||
467 | { | ||
468 | int ret = NUMA_NO_NODE; | ||
469 | int i; | ||
470 | |||
471 | for_each_node_mask(i, nodes_parsed) { | ||
472 | /* | ||
473 | * Find the real node that this emulated node appears on. For | ||
474 | * the sake of simplicity, we only use a real node's starting | ||
475 | * address to determine which emulated node it appears on. | ||
476 | */ | ||
477 | if (addr >= nodes[i].start && addr < nodes[i].end) { | ||
478 | ret = i; | ||
479 | break; | ||
480 | } | ||
481 | } | ||
482 | return ret; | ||
483 | } | ||
484 | |||
485 | /* | ||
486 | * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID | ||
487 | * mappings that respect the real ACPI topology but reflect our emulated | ||
488 | * environment. For each emulated node, we find which real node it appears on | ||
489 | * and create PXM to NID mappings for those fake nodes which mirror that | ||
490 | * locality. SLIT will now represent the correct distances between emulated | ||
491 | * nodes as a result of the real topology. | ||
492 | */ | ||
493 | void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) | ||
494 | { | ||
495 | int i, j; | ||
496 | |||
497 | printk(KERN_INFO "Faking PXM affinity for fake nodes on real " | ||
498 | "topology.\n"); | ||
499 | for (i = 0; i < num_nodes; i++) { | ||
500 | int nid, pxm; | ||
501 | |||
502 | nid = find_node_by_addr(fake_nodes[i].start); | ||
503 | if (nid == NUMA_NO_NODE) | ||
504 | continue; | ||
505 | pxm = node_to_pxm(nid); | ||
506 | if (pxm == PXM_INVAL) | ||
507 | continue; | ||
508 | fake_node_to_pxm_map[i] = pxm; | ||
509 | /* | ||
510 | * For each apicid_to_node mapping that exists for this real | ||
511 | * node, it must now point to the fake node ID. | ||
512 | */ | ||
513 | for (j = 0; j < MAX_LOCAL_APIC; j++) | ||
514 | if (apicid_to_node[j] == nid && | ||
515 | fake_apicid_to_node[j] == NUMA_NO_NODE) | ||
516 | fake_apicid_to_node[j] = i; | ||
517 | } | ||
518 | for (i = 0; i < num_nodes; i++) | ||
519 | __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i); | ||
520 | memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node)); | ||
521 | |||
522 | nodes_clear(nodes_parsed); | ||
523 | for (i = 0; i < num_nodes; i++) | ||
524 | if (fake_nodes[i].start != fake_nodes[i].end) | ||
525 | node_set(i, nodes_parsed); | ||
526 | } | ||
527 | |||
528 | static int null_slit_node_compare(int a, int b) | ||
529 | { | ||
530 | return node_to_pxm(a) == node_to_pxm(b); | ||
531 | } | ||
532 | #else | ||
533 | static int null_slit_node_compare(int a, int b) | ||
534 | { | ||
535 | return a == b; | ||
536 | } | ||
537 | #endif /* CONFIG_NUMA_EMU */ | ||
538 | |||
539 | int __node_distance(int a, int b) | ||
540 | { | ||
541 | int index; | ||
542 | |||
543 | if (!acpi_slit) | ||
544 | return null_slit_node_compare(a, b) ? LOCAL_DISTANCE : | ||
545 | REMOTE_DISTANCE; | ||
546 | index = acpi_slit->locality_count * node_to_pxm(a); | ||
547 | return acpi_slit->entry[index + node_to_pxm(b)]; | ||
548 | } | ||
549 | |||
550 | EXPORT_SYMBOL(__node_distance); | ||
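
The SLIT is a flat locality_count x locality_count matrix of byte distances indexed by proximity domain; a minimal sketch of the lookup, assuming an identity node-to-PXM mapping and the conventional LOCAL_DISTANCE/REMOTE_DISTANCE values of 10 and 20. Without a SLIT the kernel can only distinguish local from remote, which is what the null_slit_node_compare() fallback above encodes.

#include <stdio.h>
#include <stdint.h>

#define LOCAL_DISTANCE  10
#define REMOTE_DISTANCE 20

/* A made-up 3-domain SLIT: row-major matrix of relative distances,
 * laid out exactly as in the ACPI table. */
static const unsigned locality_count = 3;
static const uint8_t slit_entry[] = {
        10, 16, 21,
        16, 10, 21,
        21, 21, 10,
};

/* Assuming node IDs equal proximity domains here; __node_distance()
 * goes through node_to_pxm() first. */
static int node_distance(int a, int b, int have_slit)
{
        if (!have_slit)
                return a == b ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return slit_entry[locality_count * a + b];
}

int main(void)
{
        printf("d(0,0)=%d d(0,2)=%d\n", node_distance(0, 0, 1), node_distance(0, 2, 1));
        printf("no SLIT: d(0,2)=%d\n", node_distance(0, 2, 0));
        return 0;
}
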
551 | |||
552 | #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) | ||
553 | int memory_add_physaddr_to_nid(u64 start) | ||
554 | { | ||
555 | int i, ret = 0; | ||
556 | |||
557 | for_each_node(i) | ||
558 | if (nodes_add[i].start <= start && nodes_add[i].end > start) | ||
559 | ret = i; | ||
560 | |||
561 | return ret; | ||
562 | } | ||
563 | EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); | ||
564 | #endif | ||
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c03f14ab6667..d6c0418c3e47 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/smp.h> | 5 | #include <linux/smp.h> |
6 | #include <linux/interrupt.h> | 6 | #include <linux/interrupt.h> |
7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
8 | #include <linux/cpu.h> | ||
8 | 9 | ||
9 | #include <asm/tlbflush.h> | 10 | #include <asm/tlbflush.h> |
10 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
@@ -52,6 +53,8 @@ union smp_flush_state { | |||
52 | want false sharing in the per cpu data segment. */ | 53 | want false sharing in the per cpu data segment. */ |
53 | static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; | 54 | static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; |
54 | 55 | ||
56 | static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset); | ||
57 | |||
55 | /* | 58 | /* |
56 | * We cannot call mmdrop() because we are in interrupt context, | 59 | * We cannot call mmdrop() because we are in interrupt context, |
57 | * instead update mm->cpu_vm_mask. | 60 | * instead update mm->cpu_vm_mask. |
@@ -173,15 +176,11 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
173 | union smp_flush_state *f; | 176 | union smp_flush_state *f; |
174 | 177 | ||
175 | /* Caller has disabled preemption */ | 178 | /* Caller has disabled preemption */ |
176 | sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; | 179 | sender = this_cpu_read(tlb_vector_offset); |
177 | f = &flush_state[sender]; | 180 | f = &flush_state[sender]; |
178 | 181 | ||
179 | /* | 182 | if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) |
180 | * Could avoid this lock when | 183 | raw_spin_lock(&f->tlbstate_lock); |
181 | * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is | ||
182 | * probably not worth checking this for a cache-hot lock. | ||
183 | */ | ||
184 | raw_spin_lock(&f->tlbstate_lock); | ||
185 | 184 | ||
186 | f->flush_mm = mm; | 185 | f->flush_mm = mm; |
187 | f->flush_va = va; | 186 | f->flush_va = va; |
@@ -199,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask, | |||
199 | 198 | ||
200 | f->flush_mm = NULL; | 199 | f->flush_mm = NULL; |
201 | f->flush_va = 0; | 200 | f->flush_va = 0; |
202 | raw_spin_unlock(&f->tlbstate_lock); | 201 | if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS) |
202 | raw_spin_unlock(&f->tlbstate_lock); | ||
203 | } | 203 | } |
204 | 204 | ||
205 | void native_flush_tlb_others(const struct cpumask *cpumask, | 205 | void native_flush_tlb_others(const struct cpumask *cpumask, |
@@ -208,16 +208,57 @@ void native_flush_tlb_others(const struct cpumask *cpumask, | |||
208 | if (is_uv_system()) { | 208 | if (is_uv_system()) { |
209 | unsigned int cpu; | 209 | unsigned int cpu; |
210 | 210 | ||
211 | cpu = get_cpu(); | 211 | cpu = smp_processor_id(); |
212 | cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); | 212 | cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu); |
213 | if (cpumask) | 213 | if (cpumask) |
214 | flush_tlb_others_ipi(cpumask, mm, va); | 214 | flush_tlb_others_ipi(cpumask, mm, va); |
215 | put_cpu(); | ||
216 | return; | 215 | return; |
217 | } | 216 | } |
218 | flush_tlb_others_ipi(cpumask, mm, va); | 217 | flush_tlb_others_ipi(cpumask, mm, va); |
219 | } | 218 | } |
220 | 219 | ||
220 | static void __cpuinit calculate_tlb_offset(void) | ||
221 | { | ||
222 | int cpu, node, nr_node_vecs, idx = 0; | ||
223 | /* | ||
224 | * we are changing tlb_vector_offset for each CPU in runtime, but this | ||
225 | * will not cause inconsistency, as the write is atomic under X86. we | ||
226 | * might see more lock contentions in a short time, but after all CPU's | ||
227 | * tlb_vector_offset are changed, everything should go normal | ||
228 | * | ||
229 | * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might | ||
230 | * waste some vectors. | ||
231 | **/ | ||
232 | if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS) | ||
233 | nr_node_vecs = 1; | ||
234 | else | ||
235 | nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; | ||
236 | |||
237 | for_each_online_node(node) { | ||
238 | int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * | ||
239 | nr_node_vecs; | ||
240 | int cpu_offset = 0; | ||
241 | for_each_cpu(cpu, cpumask_of_node(node)) { | ||
242 | per_cpu(tlb_vector_offset, cpu) = node_offset + | ||
243 | cpu_offset; | ||
244 | cpu_offset++; | ||
245 | cpu_offset = cpu_offset % nr_node_vecs; | ||
246 | } | ||
247 | idx++; | ||
248 | } | ||
249 | } | ||
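
calculate_tlb_offset() carves the invalidate vectors up per online node and round-robins the CPUs within each node across that node's share. A userspace sketch for a hypothetical 2-node, 8-CPU box, assuming the usual NUM_INVALIDATE_TLB_VECTORS of 8. Splitting the vectors by node keeps CPUs on different nodes off each other's flush_state lock, which is why the offsets are recomputed from the hotplug notifier below whenever a CPU comes or goes.

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8    /* assumed value of the x86 define */
#define NR_CPUS 8

int main(void)
{
        /* Hypothetical topology: CPUs 0-3 on node 0, CPUs 4-7 on node 1. */
        int cpu_to_node[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
        int nr_online_nodes = 2;
        int tlb_vector_offset[NR_CPUS];

        int nr_node_vecs = nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS ?
                           1 : NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

        for (int node = 0, idx = 0; node < nr_online_nodes; node++, idx++) {
                int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;
                int cpu_offset = 0;

                for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                        if (cpu_to_node[cpu] != node)
                                continue;
                        tlb_vector_offset[cpu] = node_offset + cpu_offset;
                        cpu_offset = (cpu_offset + 1) % nr_node_vecs;
                }
        }

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu %d -> flush_state[%d]\n", cpu, tlb_vector_offset[cpu]);
        return 0;
}
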
250 | |||
251 | static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, | ||
252 | unsigned long action, void *hcpu) | ||
253 | { | ||
254 | switch (action & 0xf) { | ||
255 | case CPU_ONLINE: | ||
256 | case CPU_DEAD: | ||
257 | calculate_tlb_offset(); | ||
258 | } | ||
259 | return NOTIFY_OK; | ||
260 | } | ||
261 | |||
221 | static int __cpuinit init_smp_flush(void) | 262 | static int __cpuinit init_smp_flush(void) |
222 | { | 263 | { |
223 | int i; | 264 | int i; |
@@ -225,6 +266,8 @@ static int __cpuinit init_smp_flush(void) | |||
225 | for (i = 0; i < ARRAY_SIZE(flush_state); i++) | 266 | for (i = 0; i < ARRAY_SIZE(flush_state); i++) |
226 | raw_spin_lock_init(&flush_state[i].tlbstate_lock); | 267 | raw_spin_lock_init(&flush_state[i].tlbstate_lock); |
227 | 268 | ||
269 | calculate_tlb_offset(); | ||
270 | hotcpu_notifier(tlb_cpuhp_notify, 0); | ||
228 | return 0; | 271 | return 0; |
229 | } | 272 | } |
230 | core_initcall(init_smp_flush); | 273 | core_initcall(init_smp_flush); |