aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64/mm')
-rw-r--r--arch/x86_64/mm/fault.c5
-rw-r--r--arch/x86_64/mm/init.c172
-rw-r--r--arch/x86_64/mm/k8topology.c9
-rw-r--r--arch/x86_64/mm/numa.c306
-rw-r--r--arch/x86_64/mm/pageattr.c32
-rw-r--r--arch/x86_64/mm/srat.c8
6 files changed, 306 insertions, 226 deletions
diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c
index 6ada7231f3ab..de99dba2c515 100644
--- a/arch/x86_64/mm/fault.c
+++ b/arch/x86_64/mm/fault.c
@@ -585,7 +585,7 @@ do_sigbus:
585} 585}
586 586
587DEFINE_SPINLOCK(pgd_lock); 587DEFINE_SPINLOCK(pgd_lock);
588struct page *pgd_list; 588LIST_HEAD(pgd_list);
589 589
590void vmalloc_sync_all(void) 590void vmalloc_sync_all(void)
591{ 591{
@@ -605,8 +605,7 @@ void vmalloc_sync_all(void)
605 if (pgd_none(*pgd_ref)) 605 if (pgd_none(*pgd_ref))
606 continue; 606 continue;
607 spin_lock(&pgd_lock); 607 spin_lock(&pgd_lock);
608 for (page = pgd_list; page; 608 list_for_each_entry(page, &pgd_list, lru) {
609 page = (struct page *)page->index) {
610 pgd_t *pgd; 609 pgd_t *pgd;
611 pgd = (pgd_t *)page_address(page) + pgd_index(address); 610 pgd = (pgd_t *)page_address(page) + pgd_index(address);
612 if (pgd_none(*pgd)) 611 if (pgd_none(*pgd))
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index ec31534eb104..282b0a8f00ad 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -22,10 +22,12 @@
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/proc_fs.h> 23#include <linux/proc_fs.h>
24#include <linux/pci.h> 24#include <linux/pci.h>
25#include <linux/pfn.h>
25#include <linux/poison.h> 26#include <linux/poison.h>
26#include <linux/dma-mapping.h> 27#include <linux/dma-mapping.h>
27#include <linux/module.h> 28#include <linux/module.h>
28#include <linux/memory_hotplug.h> 29#include <linux/memory_hotplug.h>
30#include <linux/nmi.h>
29 31
30#include <asm/processor.h> 32#include <asm/processor.h>
31#include <asm/system.h> 33#include <asm/system.h>
@@ -46,7 +48,7 @@
46#define Dprintk(x...) 48#define Dprintk(x...)
47#endif 49#endif
48 50
49struct dma_mapping_ops* dma_ops; 51const struct dma_mapping_ops* dma_ops;
50EXPORT_SYMBOL(dma_ops); 52EXPORT_SYMBOL(dma_ops);
51 53
52static unsigned long dma_reserve __initdata; 54static unsigned long dma_reserve __initdata;
@@ -72,6 +74,11 @@ void show_mem(void)
72 74
73 for_each_online_pgdat(pgdat) { 75 for_each_online_pgdat(pgdat) {
74 for (i = 0; i < pgdat->node_spanned_pages; ++i) { 76 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
77 /* this loop can take a while with 256 GB and 4k pages
78 so update the NMI watchdog */
79 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
80 touch_nmi_watchdog();
81 }
75 page = pfn_to_page(pgdat->node_start_pfn + i); 82 page = pfn_to_page(pgdat->node_start_pfn + i);
76 total++; 83 total++;
77 if (PageReserved(page)) 84 if (PageReserved(page))
@@ -167,23 +174,9 @@ __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
167 174
168unsigned long __initdata table_start, table_end; 175unsigned long __initdata table_start, table_end;
169 176
170extern pmd_t temp_boot_pmds[]; 177static __meminit void *alloc_low_page(unsigned long *phys)
171
172static struct temp_map {
173 pmd_t *pmd;
174 void *address;
175 int allocated;
176} temp_mappings[] __initdata = {
177 { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
178 { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
179 {}
180};
181
182static __meminit void *alloc_low_page(int *index, unsigned long *phys)
183{ 178{
184 struct temp_map *ti; 179 unsigned long pfn = table_end++;
185 int i;
186 unsigned long pfn = table_end++, paddr;
187 void *adr; 180 void *adr;
188 181
189 if (after_bootmem) { 182 if (after_bootmem) {
@@ -194,57 +187,63 @@ static __meminit void *alloc_low_page(int *index, unsigned long *phys)
194 187
195 if (pfn >= end_pfn) 188 if (pfn >= end_pfn)
196 panic("alloc_low_page: ran out of memory"); 189 panic("alloc_low_page: ran out of memory");
197 for (i = 0; temp_mappings[i].allocated; i++) { 190
198 if (!temp_mappings[i].pmd) 191 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
199 panic("alloc_low_page: ran out of temp mappings");
200 }
201 ti = &temp_mappings[i];
202 paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
203 set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
204 ti->allocated = 1;
205 __flush_tlb();
206 adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
207 memset(adr, 0, PAGE_SIZE); 192 memset(adr, 0, PAGE_SIZE);
208 *index = i; 193 *phys = pfn * PAGE_SIZE;
209 *phys = pfn * PAGE_SIZE; 194 return adr;
210 return adr; 195}
211}
212 196
213static __meminit void unmap_low_page(int i) 197static __meminit void unmap_low_page(void *adr)
214{ 198{
215 struct temp_map *ti;
216 199
217 if (after_bootmem) 200 if (after_bootmem)
218 return; 201 return;
219 202
220 ti = &temp_mappings[i]; 203 early_iounmap(adr, PAGE_SIZE);
221 set_pmd(ti->pmd, __pmd(0));
222 ti->allocated = 0;
223} 204}
224 205
225/* Must run before zap_low_mappings */ 206/* Must run before zap_low_mappings */
226__init void *early_ioremap(unsigned long addr, unsigned long size) 207__init void *early_ioremap(unsigned long addr, unsigned long size)
227{ 208{
228 unsigned long map = round_down(addr, LARGE_PAGE_SIZE); 209 unsigned long vaddr;
229 210 pmd_t *pmd, *last_pmd;
230 /* actually usually some more */ 211 int i, pmds;
231 if (size >= LARGE_PAGE_SIZE) { 212
232 return NULL; 213 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
214 vaddr = __START_KERNEL_map;
215 pmd = level2_kernel_pgt;
216 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
217 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
218 for (i = 0; i < pmds; i++) {
219 if (pmd_present(pmd[i]))
220 goto next;
221 }
222 vaddr += addr & ~PMD_MASK;
223 addr &= PMD_MASK;
224 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
225 set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE));
226 __flush_tlb();
227 return (void *)vaddr;
228 next:
229 ;
233 } 230 }
234 set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); 231 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
235 map += LARGE_PAGE_SIZE; 232 return NULL;
236 set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE));
237 __flush_tlb();
238 return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1));
239} 233}
240 234
241/* To avoid virtual aliases later */ 235/* To avoid virtual aliases later */
242__init void early_iounmap(void *addr, unsigned long size) 236__init void early_iounmap(void *addr, unsigned long size)
243{ 237{
244 if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) 238 unsigned long vaddr;
245 printk("early_iounmap: bad address %p\n", addr); 239 pmd_t *pmd;
246 set_pmd(temp_mappings[0].pmd, __pmd(0)); 240 int i, pmds;
247 set_pmd(temp_mappings[1].pmd, __pmd(0)); 241
242 vaddr = (unsigned long)addr;
243 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
244 pmd = level2_kernel_pgt + pmd_index(vaddr);
245 for (i = 0; i < pmds; i++)
246 pmd_clear(pmd + i);
248 __flush_tlb(); 247 __flush_tlb();
249} 248}
250 249
@@ -289,7 +288,6 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
289 288
290 289
291 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { 290 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
292 int map;
293 unsigned long pmd_phys; 291 unsigned long pmd_phys;
294 pud_t *pud = pud_page + pud_index(addr); 292 pud_t *pud = pud_page + pud_index(addr);
295 pmd_t *pmd; 293 pmd_t *pmd;
@@ -307,12 +305,12 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne
307 continue; 305 continue;
308 } 306 }
309 307
310 pmd = alloc_low_page(&map, &pmd_phys); 308 pmd = alloc_low_page(&pmd_phys);
311 spin_lock(&init_mm.page_table_lock); 309 spin_lock(&init_mm.page_table_lock);
312 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); 310 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
313 phys_pmd_init(pmd, addr, end); 311 phys_pmd_init(pmd, addr, end);
314 spin_unlock(&init_mm.page_table_lock); 312 spin_unlock(&init_mm.page_table_lock);
315 unmap_low_page(map); 313 unmap_low_page(pmd);
316 } 314 }
317 __flush_tlb(); 315 __flush_tlb();
318} 316}
@@ -364,7 +362,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
364 end = (unsigned long)__va(end); 362 end = (unsigned long)__va(end);
365 363
366 for (; start < end; start = next) { 364 for (; start < end; start = next) {
367 int map;
368 unsigned long pud_phys; 365 unsigned long pud_phys;
369 pgd_t *pgd = pgd_offset_k(start); 366 pgd_t *pgd = pgd_offset_k(start);
370 pud_t *pud; 367 pud_t *pud;
@@ -372,7 +369,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
372 if (after_bootmem) 369 if (after_bootmem)
373 pud = pud_offset(pgd, start & PGDIR_MASK); 370 pud = pud_offset(pgd, start & PGDIR_MASK);
374 else 371 else
375 pud = alloc_low_page(&map, &pud_phys); 372 pud = alloc_low_page(&pud_phys);
376 373
377 next = start + PGDIR_SIZE; 374 next = start + PGDIR_SIZE;
378 if (next > end) 375 if (next > end)
@@ -380,7 +377,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
380 phys_pud_init(pud, __pa(start), __pa(next)); 377 phys_pud_init(pud, __pa(start), __pa(next));
381 if (!after_bootmem) 378 if (!after_bootmem)
382 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); 379 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
383 unmap_low_page(map); 380 unmap_low_page(pud);
384 } 381 }
385 382
386 if (!after_bootmem) 383 if (!after_bootmem)
@@ -388,21 +385,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end)
388 __flush_tlb_all(); 385 __flush_tlb_all();
389} 386}
390 387
391void __cpuinit zap_low_mappings(int cpu)
392{
393 if (cpu == 0) {
394 pgd_t *pgd = pgd_offset_k(0UL);
395 pgd_clear(pgd);
396 } else {
397 /*
398 * For AP's, zap the low identity mappings by changing the cr3
399 * to init_level4_pgt and doing local flush tlb all
400 */
401 asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
402 }
403 __flush_tlb_all();
404}
405
406#ifndef CONFIG_NUMA 388#ifndef CONFIG_NUMA
407void __init paging_init(void) 389void __init paging_init(void)
408{ 390{
@@ -579,15 +561,6 @@ void __init mem_init(void)
579 reservedpages << (PAGE_SHIFT-10), 561 reservedpages << (PAGE_SHIFT-10),
580 datasize >> 10, 562 datasize >> 10,
581 initsize >> 10); 563 initsize >> 10);
582
583#ifdef CONFIG_SMP
584 /*
585 * Sync boot_level4_pgt mappings with the init_level4_pgt
586 * except for the low identity mappings which are already zapped
587 * in init_level4_pgt. This sync-up is essential for AP's bringup
588 */
589 memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
590#endif
591} 564}
592 565
593void free_init_pages(char *what, unsigned long begin, unsigned long end) 566void free_init_pages(char *what, unsigned long begin, unsigned long end)
@@ -597,37 +570,44 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
597 if (begin >= end) 570 if (begin >= end)
598 return; 571 return;
599 572
600 printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); 573 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
601 for (addr = begin; addr < end; addr += PAGE_SIZE) { 574 for (addr = begin; addr < end; addr += PAGE_SIZE) {
602 ClearPageReserved(virt_to_page(addr)); 575 struct page *page = pfn_to_page(addr >> PAGE_SHIFT);
603 init_page_count(virt_to_page(addr)); 576 ClearPageReserved(page);
604 memset((void *)(addr & ~(PAGE_SIZE-1)), 577 init_page_count(page);
605 POISON_FREE_INITMEM, PAGE_SIZE); 578 memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE);
606 free_page(addr); 579 if (addr >= __START_KERNEL_map)
580 change_page_attr_addr(addr, 1, __pgprot(0));
581 __free_page(page);
607 totalram_pages++; 582 totalram_pages++;
608 } 583 }
584 if (addr > __START_KERNEL_map)
585 global_flush_tlb();
609} 586}
610 587
611void free_initmem(void) 588void free_initmem(void)
612{ 589{
613 memset(__initdata_begin, POISON_FREE_INITDATA,
614 __initdata_end - __initdata_begin);
615 free_init_pages("unused kernel memory", 590 free_init_pages("unused kernel memory",
616 (unsigned long)(&__init_begin), 591 __pa_symbol(&__init_begin),
617 (unsigned long)(&__init_end)); 592 __pa_symbol(&__init_end));
618} 593}
619 594
620#ifdef CONFIG_DEBUG_RODATA 595#ifdef CONFIG_DEBUG_RODATA
621 596
622void mark_rodata_ro(void) 597void mark_rodata_ro(void)
623{ 598{
624 unsigned long addr = (unsigned long)__start_rodata; 599 unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size;
625 600
626 for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) 601#ifdef CONFIG_HOTPLUG_CPU
627 change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); 602 /* It must still be possible to apply SMP alternatives. */
603 if (num_possible_cpus() > 1)
604 start = PFN_ALIGN(__va(__pa_symbol(&_etext)));
605#endif
606 size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start;
607 change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO);
628 608
629 printk ("Write protecting the kernel read-only data: %luk\n", 609 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
630 (__end_rodata - __start_rodata) >> 10); 610 size >> 10);
631 611
632 /* 612 /*
633 * change_page_attr_addr() requires a global_flush_tlb() call after it. 613 * change_page_attr_addr() requires a global_flush_tlb() call after it.
@@ -642,7 +622,7 @@ void mark_rodata_ro(void)
642#ifdef CONFIG_BLK_DEV_INITRD 622#ifdef CONFIG_BLK_DEV_INITRD
643void free_initrd_mem(unsigned long start, unsigned long end) 623void free_initrd_mem(unsigned long start, unsigned long end)
644{ 624{
645 free_init_pages("initrd memory", start, end); 625 free_init_pages("initrd memory", __pa(start), __pa(end));
646} 626}
647#endif 627#endif
648 628
diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c
index b5b8dba28b4e..f983c75825d0 100644
--- a/arch/x86_64/mm/k8topology.c
+++ b/arch/x86_64/mm/k8topology.c
@@ -49,11 +49,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
49 int found = 0; 49 int found = 0;
50 u32 reg; 50 u32 reg;
51 unsigned numnodes; 51 unsigned numnodes;
52 nodemask_t nodes_parsed;
53 unsigned dualcore = 0; 52 unsigned dualcore = 0;
54 53
55 nodes_clear(nodes_parsed);
56
57 if (!early_pci_allowed()) 54 if (!early_pci_allowed())
58 return -1; 55 return -1;
59 56
@@ -65,6 +62,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
65 62
66 reg = read_pci_config(0, nb, 0, 0x60); 63 reg = read_pci_config(0, nb, 0, 0x60);
67 numnodes = ((reg >> 4) & 0xF) + 1; 64 numnodes = ((reg >> 4) & 0xF) + 1;
65 if (numnodes <= 1)
66 return -1;
68 67
69 printk(KERN_INFO "Number of nodes %d\n", numnodes); 68 printk(KERN_INFO "Number of nodes %d\n", numnodes);
70 69
@@ -102,7 +101,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
102 nodeid, (base>>8)&3, (limit>>8) & 3); 101 nodeid, (base>>8)&3, (limit>>8) & 3);
103 return -1; 102 return -1;
104 } 103 }
105 if (node_isset(nodeid, nodes_parsed)) { 104 if (node_isset(nodeid, node_possible_map)) {
106 printk(KERN_INFO "Node %d already present. Skipping\n", 105 printk(KERN_INFO "Node %d already present. Skipping\n",
107 nodeid); 106 nodeid);
108 continue; 107 continue;
@@ -155,7 +154,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
155 154
156 prevbase = base; 155 prevbase = base;
157 156
158 node_set(nodeid, nodes_parsed); 157 node_set(nodeid, node_possible_map);
159 } 158 }
160 159
161 if (!found) 160 if (!found)
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 41b8fb069924..51548947ad3b 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -273,125 +273,213 @@ void __init numa_init_array(void)
273 273
274#ifdef CONFIG_NUMA_EMU 274#ifdef CONFIG_NUMA_EMU
275/* Numa emulation */ 275/* Numa emulation */
276int numa_fake __initdata = 0; 276#define E820_ADDR_HOLE_SIZE(start, end) \
277 (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
278 PAGE_SHIFT)
279char *cmdline __initdata;
277 280
278/* 281/*
279 * This function is used to find out if the start and end correspond to 282 * Setups up nid to range from addr to addr + size. If the end boundary is
280 * different zones. 283 * greater than max_addr, then max_addr is used instead. The return value is 0
284 * if there is additional memory left for allocation past addr and -1 otherwise.
285 * addr is adjusted to be at the end of the node.
281 */ 286 */
282int zone_cross_over(unsigned long start, unsigned long end) 287static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
288 u64 size, u64 max_addr)
283{ 289{
284 if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) && 290 int ret = 0;
285 (end >= (MAX_DMA32_PFN << PAGE_SHIFT))) 291 nodes[nid].start = *addr;
286 return 1; 292 *addr += size;
287 return 0; 293 if (*addr >= max_addr) {
294 *addr = max_addr;
295 ret = -1;
296 }
297 nodes[nid].end = *addr;
298 node_set(nid, node_possible_map);
299 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
300 nodes[nid].start, nodes[nid].end,
301 (nodes[nid].end - nodes[nid].start) >> 20);
302 return ret;
288} 303}
289 304
290static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) 305/*
306 * Splits num_nodes nodes up equally starting at node_start. The return value
307 * is the number of nodes split up and addr is adjusted to be at the end of the
308 * last node allocated.
309 */
310static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
311 u64 max_addr, int node_start,
312 int num_nodes)
291{ 313{
292 int i, big; 314 unsigned int big;
293 struct bootnode nodes[MAX_NUMNODES]; 315 u64 size;
294 unsigned long sz, old_sz; 316 int i;
295 unsigned long hole_size;
296 unsigned long start, end;
297 unsigned long max_addr = (end_pfn << PAGE_SHIFT);
298
299 start = (start_pfn << PAGE_SHIFT);
300 hole_size = e820_hole_size(start, max_addr);
301 sz = (max_addr - start - hole_size) / numa_fake;
302
303 /* Kludge needed for the hash function */
304
305 old_sz = sz;
306 /*
307 * Round down to the nearest FAKE_NODE_MIN_SIZE.
308 */
309 sz &= FAKE_NODE_MIN_HASH_MASK;
310 317
318 if (num_nodes <= 0)
319 return -1;
320 if (num_nodes > MAX_NUMNODES)
321 num_nodes = MAX_NUMNODES;
322 size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
323 num_nodes;
311 /* 324 /*
312 * We ensure that each node is at least 64MB big. Smaller than this 325 * Calculate the number of big nodes that can be allocated as a result
313 * size can cause VM hiccups. 326 * of consolidating the leftovers.
314 */ 327 */
315 if (sz == 0) { 328 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
316 printk(KERN_INFO "Not enough memory for %d nodes. Reducing " 329 FAKE_NODE_MIN_SIZE;
317 "the number of nodes\n", numa_fake); 330
318 numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE; 331 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
319 printk(KERN_INFO "Number of fake nodes will be = %d\n", 332 size &= FAKE_NODE_MIN_HASH_MASK;
320 numa_fake); 333 if (!size) {
321 sz = FAKE_NODE_MIN_SIZE; 334 printk(KERN_ERR "Not enough memory for each node. "
335 "NUMA emulation disabled.\n");
336 return -1;
322 } 337 }
323 /* 338
324 * Find out how many nodes can get an extra NODE_MIN_SIZE granule. 339 for (i = node_start; i < num_nodes + node_start; i++) {
325 * This logic ensures the extra memory gets distributed among as many 340 u64 end = *addr + size;
326 * nodes as possible (as compared to one single node getting all that
327 * extra memory.
328 */
329 big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
330 printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
331 "%d\n",
332 (sz >> 20), (hole_size >> 20), big);
333 memset(&nodes,0,sizeof(nodes));
334 end = start;
335 for (i = 0; i < numa_fake; i++) {
336 /*
337 * In case we are not able to allocate enough memory for all
338 * the nodes, we reduce the number of fake nodes.
339 */
340 if (end >= max_addr) {
341 numa_fake = i - 1;
342 break;
343 }
344 start = nodes[i].start = end;
345 /*
346 * Final node can have all the remaining memory.
347 */
348 if (i == numa_fake-1)
349 sz = max_addr - start;
350 end = nodes[i].start + sz;
351 /*
352 * Fir "big" number of nodes get extra granule.
353 */
354 if (i < big) 341 if (i < big)
355 end += FAKE_NODE_MIN_SIZE; 342 end += FAKE_NODE_MIN_SIZE;
356 /* 343 /*
357 * Iterate over the range to ensure that this node gets at 344 * The final node can have the remaining system RAM. Other
358 * least sz amount of RAM (excluding holes) 345 * nodes receive roughly the same amount of available pages.
359 */ 346 */
360 while ((end - start - e820_hole_size(start, end)) < sz) { 347 if (i == num_nodes + node_start - 1)
361 end += FAKE_NODE_MIN_SIZE; 348 end = max_addr;
362 if (end >= max_addr) 349 else
363 break; 350 while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
351 size) {
352 end += FAKE_NODE_MIN_SIZE;
353 if (end > max_addr) {
354 end = max_addr;
355 break;
356 }
357 }
358 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
359 break;
360 }
361 return i - node_start + 1;
362}
363
364/*
365 * Splits the remaining system RAM into chunks of size. The remaining memory is
366 * always assigned to a final node and can be asymmetric. Returns the number of
367 * nodes split.
368 */
369static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
370 u64 max_addr, int node_start, u64 size)
371{
372 int i = node_start;
373 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
374 while (!setup_node_range(i++, nodes, addr, size, max_addr))
375 ;
376 return i - node_start;
377}
378
379/*
380 * Sets up the system RAM area from start_pfn to end_pfn according to the
381 * numa=fake command-line option.
382 */
383static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
384{
385 struct bootnode nodes[MAX_NUMNODES];
386 u64 addr = start_pfn << PAGE_SHIFT;
387 u64 max_addr = end_pfn << PAGE_SHIFT;
388 int num_nodes = 0;
389 int coeff_flag;
390 int coeff = -1;
391 int num = 0;
392 u64 size;
393 int i;
394
395 memset(&nodes, 0, sizeof(nodes));
396 /*
397 * If the numa=fake command-line is just a single number N, split the
398 * system RAM into N fake nodes.
399 */
400 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
401 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
402 simple_strtol(cmdline, NULL, 0));
403 if (num_nodes < 0)
404 return num_nodes;
405 goto out;
406 }
407
408 /* Parse the command line. */
409 for (coeff_flag = 0; ; cmdline++) {
410 if (*cmdline && isdigit(*cmdline)) {
411 num = num * 10 + *cmdline - '0';
412 continue;
364 } 413 }
365 /* 414 if (*cmdline == '*') {
366 * Look at the next node to make sure there is some real memory 415 if (num > 0)
367 * to map. Bad things happen when the only memory present 416 coeff = num;
368 * in a zone on a fake node is IO hole. 417 coeff_flag = 1;
369 */ 418 }
370 while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) { 419 if (!*cmdline || *cmdline == ',') {
371 if (zone_cross_over(start, end + sz)) { 420 if (!coeff_flag)
372 end = (MAX_DMA32_PFN << PAGE_SHIFT); 421 coeff = 1;
422 /*
423 * Round down to the nearest FAKE_NODE_MIN_SIZE.
424 * Command-line coefficients are in megabytes.
425 */
426 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
427 if (size)
428 for (i = 0; i < coeff; i++, num_nodes++)
429 if (setup_node_range(num_nodes, nodes,
430 &addr, size, max_addr) < 0)
431 goto done;
432 if (!*cmdline)
373 break; 433 break;
374 } 434 coeff_flag = 0;
375 if (end >= max_addr) 435 coeff = -1;
436 }
437 num = 0;
438 }
439done:
440 if (!num_nodes)
441 return -1;
442 /* Fill remainder of system RAM, if appropriate. */
443 if (addr < max_addr) {
444 if (coeff_flag && coeff < 0) {
445 /* Split remaining nodes into num-sized chunks */
446 num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
447 num_nodes, num);
448 goto out;
449 }
450 switch (*(cmdline - 1)) {
451 case '*':
452 /* Split remaining nodes into coeff chunks */
453 if (coeff <= 0)
376 break; 454 break;
377 end += FAKE_NODE_MIN_SIZE; 455 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
456 num_nodes, coeff);
457 break;
458 case ',':
459 /* Do not allocate remaining system RAM */
460 break;
461 default:
462 /* Give one final node */
463 setup_node_range(num_nodes, nodes, &addr,
464 max_addr - addr, max_addr);
465 num_nodes++;
378 } 466 }
379 if (end > max_addr) 467 }
380 end = max_addr; 468out:
381 nodes[i].end = end; 469 memnode_shift = compute_hash_shift(nodes, num_nodes);
382 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", 470 if (memnode_shift < 0) {
383 i, 471 memnode_shift = 0;
384 nodes[i].start, nodes[i].end, 472 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
385 (nodes[i].end - nodes[i].start) >> 20); 473 "disabled.\n");
386 node_set_online(i); 474 return -1;
387 } 475 }
388 memnode_shift = compute_hash_shift(nodes, numa_fake); 476
389 if (memnode_shift < 0) { 477 /*
390 memnode_shift = 0; 478 * We need to vacate all active ranges that may have been registered by
391 printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n"); 479 * SRAT.
392 return -1; 480 */
393 } 481 remove_all_active_ranges();
394 for_each_online_node(i) { 482 for_each_node_mask(i, node_possible_map) {
395 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 483 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
396 nodes[i].end >> PAGE_SHIFT); 484 nodes[i].end >> PAGE_SHIFT);
397 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 485 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
@@ -399,26 +487,32 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
399 numa_init_array(); 487 numa_init_array();
400 return 0; 488 return 0;
401} 489}
402#endif 490#undef E820_ADDR_HOLE_SIZE
491#endif /* CONFIG_NUMA_EMU */
403 492
404void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) 493void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
405{ 494{
406 int i; 495 int i;
407 496
497 nodes_clear(node_possible_map);
498
408#ifdef CONFIG_NUMA_EMU 499#ifdef CONFIG_NUMA_EMU
409 if (numa_fake && !numa_emulation(start_pfn, end_pfn)) 500 if (cmdline && !numa_emulation(start_pfn, end_pfn))
410 return; 501 return;
502 nodes_clear(node_possible_map);
411#endif 503#endif
412 504
413#ifdef CONFIG_ACPI_NUMA 505#ifdef CONFIG_ACPI_NUMA
414 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 506 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
415 end_pfn << PAGE_SHIFT)) 507 end_pfn << PAGE_SHIFT))
416 return; 508 return;
509 nodes_clear(node_possible_map);
417#endif 510#endif
418 511
419#ifdef CONFIG_K8_NUMA 512#ifdef CONFIG_K8_NUMA
420 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) 513 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
421 return; 514 return;
515 nodes_clear(node_possible_map);
422#endif 516#endif
423 printk(KERN_INFO "%s\n", 517 printk(KERN_INFO "%s\n",
424 numa_off ? "NUMA turned off" : "No NUMA configuration found"); 518 numa_off ? "NUMA turned off" : "No NUMA configuration found");
@@ -432,6 +526,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
432 memnodemap[0] = 0; 526 memnodemap[0] = 0;
433 nodes_clear(node_online_map); 527 nodes_clear(node_online_map);
434 node_set_online(0); 528 node_set_online(0);
529 node_set(0, node_possible_map);
435 for (i = 0; i < NR_CPUS; i++) 530 for (i = 0; i < NR_CPUS; i++)
436 numa_set_node(i, 0); 531 numa_set_node(i, 0);
437 node_to_cpumask[0] = cpumask_of_cpu(0); 532 node_to_cpumask[0] = cpumask_of_cpu(0);
@@ -486,11 +581,8 @@ static __init int numa_setup(char *opt)
486 if (!strncmp(opt,"off",3)) 581 if (!strncmp(opt,"off",3))
487 numa_off = 1; 582 numa_off = 1;
488#ifdef CONFIG_NUMA_EMU 583#ifdef CONFIG_NUMA_EMU
489 if(!strncmp(opt, "fake=", 5)) { 584 if (!strncmp(opt, "fake=", 5))
490 numa_fake = simple_strtoul(opt+5,NULL,0); ; 585 cmdline = opt + 5;
491 if (numa_fake >= MAX_NUMNODES)
492 numa_fake = MAX_NUMNODES;
493 }
494#endif 586#endif
495#ifdef CONFIG_ACPI_NUMA 587#ifdef CONFIG_ACPI_NUMA
496 if (!strncmp(opt,"noacpi",6)) 588 if (!strncmp(opt,"noacpi",6))
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 081409aa3452..bf4aa8dd4254 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot,
51 SetPagePrivate(base); 51 SetPagePrivate(base);
52 page_private(base) = 0; 52 page_private(base) = 0;
53 53
54 address = __pa(address);
55 addr = address & LARGE_PAGE_MASK; 54 addr = address & LARGE_PAGE_MASK;
56 pbase = (pte_t *)page_address(base); 55 pbase = (pte_t *)page_address(base);
57 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { 56 for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
@@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage)
101 * No more special protections in this 2/4MB area - revert to a 100 * No more special protections in this 2/4MB area - revert to a
102 * large page again. 101 * large page again.
103 */ 102 */
104static void revert_page(unsigned long address, pgprot_t ref_prot) 103static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot)
105{ 104{
106 pgd_t *pgd; 105 pgd_t *pgd;
107 pud_t *pud; 106 pud_t *pud;
108 pmd_t *pmd; 107 pmd_t *pmd;
109 pte_t large_pte; 108 pte_t large_pte;
110 unsigned long pfn;
111 109
112 pgd = pgd_offset_k(address); 110 pgd = pgd_offset_k(address);
113 BUG_ON(pgd_none(*pgd)); 111 BUG_ON(pgd_none(*pgd));
@@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot)
115 BUG_ON(pud_none(*pud)); 113 BUG_ON(pud_none(*pud));
116 pmd = pmd_offset(pud, address); 114 pmd = pmd_offset(pud, address);
117 BUG_ON(pmd_val(*pmd) & _PAGE_PSE); 115 BUG_ON(pmd_val(*pmd) & _PAGE_PSE);
118 pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT;
119 large_pte = pfn_pte(pfn, ref_prot); 116 large_pte = pfn_pte(pfn, ref_prot);
120 large_pte = pte_mkhuge(large_pte); 117 large_pte = pte_mkhuge(large_pte);
121 set_pte((pte_t *)pmd, large_pte); 118 set_pte((pte_t *)pmd, large_pte);
@@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
141 */ 138 */
142 struct page *split; 139 struct page *split;
143 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); 140 ref_prot2 = pte_pgprot(pte_clrhuge(*kpte));
144 split = split_large_page(address, prot, ref_prot2); 141 split = split_large_page(pfn << PAGE_SHIFT, prot,
142 ref_prot2);
145 if (!split) 143 if (!split)
146 return -ENOMEM; 144 return -ENOMEM;
147 set_pte(kpte, mk_pte(split, ref_prot2)); 145 set_pte(kpte, mk_pte(split, ref_prot2));
@@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
160 158
161 if (page_private(kpte_page) == 0) { 159 if (page_private(kpte_page) == 0) {
162 save_page(kpte_page); 160 save_page(kpte_page);
163 revert_page(address, ref_prot); 161 revert_page(address, pfn, ref_prot);
164 } 162 }
165 return 0; 163 return 0;
166} 164}
@@ -180,22 +178,32 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot,
180 */ 178 */
181int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) 179int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot)
182{ 180{
183 int err = 0; 181 unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT;
182 int err = 0, kernel_map = 0;
184 int i; 183 int i;
185 184
185 if (address >= __START_KERNEL_map
186 && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) {
187 address = (unsigned long)__va(__pa(address));
188 kernel_map = 1;
189 }
190
186 down_write(&init_mm.mmap_sem); 191 down_write(&init_mm.mmap_sem);
187 for (i = 0; i < numpages; i++, address += PAGE_SIZE) { 192 for (i = 0; i < numpages; i++, address += PAGE_SIZE) {
188 unsigned long pfn = __pa(address) >> PAGE_SHIFT; 193 unsigned long pfn = __pa(address) >> PAGE_SHIFT;
189 194
190 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); 195 if (!kernel_map || pte_present(pfn_pte(0, prot))) {
191 if (err) 196 err = __change_page_attr(address, pfn, prot, PAGE_KERNEL);
192 break; 197 if (err)
198 break;
199 }
193 /* Handle kernel mapping too which aliases part of the 200 /* Handle kernel mapping too which aliases part of the
194 * lowmem */ 201 * lowmem */
195 if (__pa(address) < KERNEL_TEXT_SIZE) { 202 if ((pfn >= phys_base_pfn) &&
203 ((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) {
196 unsigned long addr2; 204 unsigned long addr2;
197 pgprot_t prot2; 205 pgprot_t prot2;
198 addr2 = __START_KERNEL_map + __pa(address); 206 addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT);
199 /* Make sure the kernel mappings stay executable */ 207 /* Make sure the kernel mappings stay executable */
200 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); 208 prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot)));
201 err = __change_page_attr(addr2, pfn, prot2, 209 err = __change_page_attr(addr2, pfn, prot2,
diff --git a/arch/x86_64/mm/srat.c b/arch/x86_64/mm/srat.c
index 2efe215fc76a..1e76bb0a7277 100644
--- a/arch/x86_64/mm/srat.c
+++ b/arch/x86_64/mm/srat.c
@@ -419,19 +419,21 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
419 return -1; 419 return -1;
420 } 420 }
421 421
422 node_possible_map = nodes_parsed;
423
422 /* Finally register nodes */ 424 /* Finally register nodes */
423 for_each_node_mask(i, nodes_parsed) 425 for_each_node_mask(i, node_possible_map)
424 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 426 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
425 /* Try again in case setup_node_bootmem missed one due 427 /* Try again in case setup_node_bootmem missed one due
426 to missing bootmem */ 428 to missing bootmem */
427 for_each_node_mask(i, nodes_parsed) 429 for_each_node_mask(i, node_possible_map)
428 if (!node_online(i)) 430 if (!node_online(i))
429 setup_node_bootmem(i, nodes[i].start, nodes[i].end); 431 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
430 432
431 for (i = 0; i < NR_CPUS; i++) { 433 for (i = 0; i < NR_CPUS; i++) {
432 if (cpu_to_node[i] == NUMA_NO_NODE) 434 if (cpu_to_node[i] == NUMA_NO_NODE)
433 continue; 435 continue;
434 if (!node_isset(cpu_to_node[i], nodes_parsed)) 436 if (!node_isset(cpu_to_node[i], node_possible_map))
435 numa_set_node(i, NUMA_NO_NODE); 437 numa_set_node(i, NUMA_NO_NODE);
436 } 438 }
437 numa_init_array(); 439 numa_init_array();