Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/hugetlbpage.c            1
-rw-r--r--  arch/x86/mm/init.c                  40
-rw-r--r--  arch/x86/mm/init_32.c               17
-rw-r--r--  arch/x86/mm/init_64.c               10
-rw-r--r--  arch/x86/mm/ioremap.c               55
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c    2
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c      16
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.h       2
-rw-r--r--  arch/x86/mm/kmmio.c                  1
-rw-r--r--  arch/x86/mm/mmap.c                   4
-rw-r--r--  arch/x86/mm/mmio-mod.c               1
-rw-r--r--  arch/x86/mm/numa_32.c                3
-rw-r--r--  arch/x86/mm/numa_64.c              332
-rw-r--r--  arch/x86/mm/pageattr.c              27
-rw-r--r--  arch/x86/mm/pat.c                    2
-rw-r--r--  arch/x86/mm/pgtable.c               32
-rw-r--r--  arch/x86/mm/pgtable_32.c             3
-rw-r--r--  arch/x86/mm/srat_64.c               51
-rw-r--r--  arch/x86/mm/tlb.c                    8
19 files changed, 346 insertions, 261 deletions
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f46c340727b8..069ce7c37c01 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -9,7 +9,6 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
-#include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/sysctl.h>
 #include <asm/mman.h>
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index d406c5239019..b278535b14aa 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
+#include <linux/gfp.h>
 #include <linux/initrd.h>
 #include <linux/ioport.h>
 #include <linux/swap.h>
@@ -266,16 +267,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	if (!after_bootmem)
 		find_early_table_space(end, use_pse, use_gbpages);
 
-#ifdef CONFIG_X86_32
-	for (i = 0; i < nr_range; i++)
-		kernel_physical_mapping_init(mr[i].start, mr[i].end,
-					     mr[i].page_size_mask);
-	ret = end;
-#else /* CONFIG_X86_64 */
 	for (i = 0; i < nr_range; i++)
 		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
 						   mr[i].page_size_mask);
-#endif
 
 #ifdef CONFIG_X86_32
 	early_ioremap_page_table_range_init();
@@ -338,11 +332,23 @@ int devmem_is_allowed(unsigned long pagenr)
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
-	unsigned long addr = begin;
+	unsigned long addr;
+	unsigned long begin_aligned, end_aligned;
+
+	/* Make sure boundaries are page aligned */
+	begin_aligned = PAGE_ALIGN(begin);
+	end_aligned   = end & PAGE_MASK;
 
-	if (addr >= end)
+	if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
+		begin = begin_aligned;
+		end   = end_aligned;
+	}
+
+	if (begin >= end)
 		return;
 
+	addr = begin;
+
 	/*
 	 * If debugging page accesses then do not free this memory but
 	 * mark them not present - any buggy init-section access will
@@ -350,7 +356,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	 */
 #ifdef CONFIG_DEBUG_PAGEALLOC
 	printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
-		begin, PAGE_ALIGN(end));
+		begin, end);
 	set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
 #else
 	/*
@@ -365,8 +371,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
 	for (; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
 		init_page_count(virt_to_page(addr));
-		memset((void *)(addr & ~(PAGE_SIZE-1)),
-			POISON_FREE_INITMEM, PAGE_SIZE);
+		memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -383,6 +388,15 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_init_pages("initrd memory", start, end);
+	/*
+	 * end could be not aligned, and We can not align that,
+	 * decompresser could be confused by aligned initrd_end
+	 * We already reserve the end partial page before in
+	 *   - i386_start_kernel()
+	 *   - x86_64_start_kernel()
+	 *   - relocate_initrd()
+	 * So here We can do PAGE_ALIGN() safely to get partial page to be freed
+	 */
+	free_init_pages("initrd memory", start, PAGE_ALIGN(end));
 }
 #endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9a0c258a86be..bca79091b9d6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,11 +25,11 @@
 #include <linux/pfn.h>
 #include <linux/poison.h>
 #include <linux/bootmem.h>
-#include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/memory_hotplug.h>
 #include <linux/initrd.h>
 #include <linux/cpumask.h>
+#include <linux/gfp.h>
 
 #include <asm/asm.h>
 #include <asm/bios_ebda.h>
@@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start,
 					    unsigned long page_size_mask)
 {
 	int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+	unsigned long last_map_addr = end;
 	unsigned long start_pfn, end_pfn;
 	pgd_t *pgd_base = swapper_pg_dir;
 	int pgd_idx, pmd_idx, pte_ofs;
@@ -341,9 +342,10 @@ repeat:
 					prot = PAGE_KERNEL_EXEC;
 
 				pages_4k++;
-				if (mapping_iter == 1)
+				if (mapping_iter == 1) {
 					set_pte(pte, pfn_pte(pfn, init_prot));
-				else
+					last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
+				} else
 					set_pte(pte, pfn_pte(pfn, prot));
 			}
 		}
@@ -368,7 +370,7 @@ repeat:
 		mapping_iter = 2;
 		goto repeat;
 	}
-	return 0;
+	return last_map_addr;
 }
 
 pte_t *kmap_pte;
@@ -748,6 +750,7 @@ static void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
+#ifndef CONFIG_NO_BOOTMEM
 static unsigned long __init setup_node_bootmem(int nodeid,
 				   unsigned long start_pfn,
 				   unsigned long end_pfn,
@@ -764,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid,
 	printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
 		 nodeid, bootmap, bootmap + bootmap_size);
 	free_bootmem_with_active_regions(nodeid, end_pfn);
-	early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
 
 	return bootmap + bootmap_size;
 }
+#endif
 
 void __init setup_bootmem_allocator(void)
 {
+#ifndef CONFIG_NO_BOOTMEM
 	int nodeid;
 	unsigned long bootmap_size, bootmap;
 	/*
@@ -782,11 +786,13 @@ void __init setup_bootmem_allocator(void)
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
+#endif
 
 	printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
 	printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
+#ifndef CONFIG_NO_BOOTMEM
 	for_each_online_node(nodeid) {
 		unsigned long start_pfn, end_pfn;
 
@@ -804,6 +810,7 @@ void __init setup_bootmem_allocator(void)
 		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
 						 bootmap);
 	}
+#endif
 
 	after_bootmem = 1;
 }
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 69ddfbd91135..ee41bba315d1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/memory_hotplug.h>
 #include <linux/nmi.h>
+#include <linux/gfp.h>
 
 #include <asm/processor.h>
 #include <asm/bios_ebda.h>
@@ -572,6 +573,7 @@ kernel_physical_mapping_init(unsigned long start,
 void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 				int acpi, int k8)
 {
+#ifndef CONFIG_NO_BOOTMEM
 	unsigned long bootmap_size, bootmap;
 
 	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -579,13 +581,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 				 PAGE_SIZE);
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
+	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 	/* don't touch min_low_pfn */
 	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
 					 0, end_pfn);
 	e820_register_active_regions(0, start_pfn, end_pfn);
 	free_bootmem_with_active_regions(0, end_pfn);
-	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
-	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
+#else
+	e820_register_active_regions(0, start_pfn, end_pfn);
+#endif
 }
 #endif
 
@@ -974,7 +978,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
 		if (pmd_none(*pmd)) {
 			pte_t entry;
 
-			p = vmemmap_alloc_block(PMD_SIZE, node);
+			p = vmemmap_alloc_block_buf(PMD_SIZE, node);
 			if (!p)
 				return -ENOMEM;
 
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index c246d259822d..12e4d2d3c110 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,43 +24,6 @@
 
 #include "physaddr.h"
 
-int page_is_ram(unsigned long pagenr)
-{
-	resource_size_t addr, end;
-	int i;
-
-	/*
-	 * A special case is the first 4Kb of memory;
-	 * This is a BIOS owned area, not kernel ram, but generally
-	 * not listed as such in the E820 table.
-	 */
-	if (pagenr == 0)
-		return 0;
-
-	/*
-	 * Second special case: Some BIOSen report the PC BIOS
-	 * area (640->1Mb) as ram even though it is not.
-	 */
-	if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
-		    pagenr < (BIOS_END >> PAGE_SHIFT))
-		return 0;
-
-	for (i = 0; i < e820.nr_map; i++) {
-		/*
-		 * Not usable memory:
-		 */
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
-		end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
-
-
-		if ((pagenr >= addr) && (pagenr < end))
-			return 1;
-	}
-	return 0;
-}
-
 /*
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
@@ -422,6 +385,10 @@ void __init early_ioremap_init(void)
 	 * The boot-ioremap range spans multiple pmds, for which
 	 * we are not prepared:
 	 */
+#define __FIXADDR_TOP (-PAGE_SIZE)
+	BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+		     != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
+#undef __FIXADDR_TOP
 	if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
 		WARN_ON(1);
 		printk(KERN_WARNING "pmd %p != %p\n",
@@ -481,6 +448,20 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
 static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
 static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
 
+void __init fixup_early_ioremap(void)
+{
+	int i;
+
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
+		if (prev_map[i]) {
+			WARN_ON(1);
+			break;
+		}
+	}
+
+	early_ioremap_init();
+}
+
 static int __init check_early_ioremap_leak(void)
 {
 	int count = 0;
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 8cc183344140..b3b531a4f8e5 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
 	if (!shadow)
 		return true;
 
-	status = kmemcheck_shadow_test(shadow, size);
+	status = kmemcheck_shadow_test_all(shadow, size);
 
 	return status == KMEMCHECK_SHADOW_INITIALIZED;
 }
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index 3f66b82076a3..aec124214d97 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
 
 enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
 {
+#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
 	uint8_t *x;
 	unsigned int i;
 
 	x = shadow;
 
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
 	/*
 	 * Make sure _some_ bytes are initialized. Gcc frequently generates
 	 * code to access neighboring bytes.
@@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
 		if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
 			return x[i];
 	}
+
+	return x[0];
 #else
+	return kmemcheck_shadow_test_all(shadow, size);
+#endif
+}
+
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
+{
+	uint8_t *x;
+	unsigned int i;
+
+	x = shadow;
+
 	/* All bytes must be initialized. */
 	for (i = 0; i < size; ++i) {
 		if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
 			return x[i];
 	}
-#endif
 
 	return x[0];
 }
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index af46d9ab9d86..ff0b2f70fbcb 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -11,6 +11,8 @@ enum kmemcheck_shadow {
 void *kmemcheck_shadow_lookup(unsigned long address);
 
 enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
+enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
+						unsigned int size);
 void kmemcheck_shadow_set(void *shadow, unsigned int size);
 
 #endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 536fb6823366..5d0e67fff1a6 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -21,6 +21,7 @@
 #include <linux/kdebug.h>
 #include <linux/mutex.h>
 #include <linux/io.h>
+#include <linux/slab.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <linux/errno.h>
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c8191defc38a..1dab5194fd9d 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,7 +71,7 @@ static int mmap_is_legacy(void)
 	if (current->personality & ADDR_COMPAT_LAYOUT)
 		return 1;
 
-	if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
+	if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
 		return 1;
 
 	return sysctl_legacy_va_layout;
@@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void)
 
 static unsigned long mmap_base(void)
 {
-	unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
+	unsigned long gap = rlimit(RLIMIT_STACK);
 
 	if (gap < MIN_GAP)
 		gap = MIN_GAP;
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 34a3291ca103..3adff7dcc148 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -26,6 +26,7 @@
 
 #include <linux/module.h>
 #include <linux/debugfs.h>
+#include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/version.h>
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index b20760ca7244..809baaaf48b1 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
 
 	for_each_online_node(nid) {
 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+		NODE_DATA(nid)->node_id = nid;
+#ifndef CONFIG_NO_BOOTMEM
 		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+#endif
 	}
 
 	setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 83bbc70d11bb..8948f47fde05 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 				    unsigned long end, unsigned long size,
 				    unsigned long align)
 {
-	unsigned long mem = find_e820_area(start, end, size, align);
-	void *ptr;
+	unsigned long mem;
 
+	/*
+	 * put it on high as possible
+	 * something will go with NODE_DATA
+	 */
+	if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
+		start = MAX_DMA_PFN<<PAGE_SHIFT;
+	if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
+	    end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+		start = MAX_DMA32_PFN<<PAGE_SHIFT;
+	mem = find_e820_area(start, end, size, align);
+	if (mem != -1L)
+		return __va(mem);
+
+	/* extend the search scope */
+	end = max_pfn_mapped << PAGE_SHIFT;
+	if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
+		start = MAX_DMA32_PFN<<PAGE_SHIFT;
+	else
+		start = MAX_DMA_PFN<<PAGE_SHIFT;
+	mem = find_e820_area(start, end, size, align);
 	if (mem != -1L)
 		return __va(mem);
 
-	ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
-	if (ptr == NULL) {
-		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
+	printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
 			size, nodeid);
-		return NULL;
-	}
-	return ptr;
+
+	return NULL;
 }
 
 /* Initialize bootmem allocator for a node */
 void __init
 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 {
-	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+	unsigned long start_pfn, last_pfn, nodedata_phys;
 	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
-	unsigned long bootmap_start, nodedata_phys;
-	void *bootmap;
 	int nid;
+#ifndef CONFIG_NO_BOOTMEM
+	unsigned long bootmap_start, bootmap_pages, bootmap_size;
+	void *bootmap;
+#endif
 
 	if (!end)
 		return;
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 
 	start = roundup(start, ZONE_ALIGN);
 
-	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
+	printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
 	       start, end);
 
 	start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	if (node_data[nodeid] == NULL)
 		return;
 	nodedata_phys = __pa(node_data[nodeid]);
+	reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
 	printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
 		nodedata_phys + pgdat_size - 1);
+	nid = phys_to_nid(nodedata_phys);
+	if (nid != nodeid)
+		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
 
 	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
-	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+	NODE_DATA(nodeid)->node_id = nodeid;
 	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 
+#ifndef CONFIG_NO_BOOTMEM
+	NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
+
 	/*
 	 * Find a place for the bootmem map
 	 * nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	 * of alloc_bootmem, that could clash with reserved range
 	 */
 	bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
-	nid = phys_to_nid(nodedata_phys);
-	if (nid == nodeid)
-		bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
-	else
-		bootmap_start = roundup(start, PAGE_SIZE);
+	bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
 	/*
 	 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
 	 * to use that to align to PAGE_SIZE
@@ -239,18 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	bootmap = early_node_mem(nodeid, bootmap_start, end,
 				 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 	if (bootmap == NULL) {
-		if (nodedata_phys < start || nodedata_phys >= end) {
-			/*
-			 * only need to free it if it is from other node
-			 * bootmem
-			 */
-			if (nid != nodeid)
-				free_bootmem(nodedata_phys, pgdat_size);
-		}
+		free_early(nodedata_phys, nodedata_phys + pgdat_size);
 		node_data[nodeid] = NULL;
 		return;
 	}
 	bootmap_start = __pa(bootmap);
+	reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
+			"BOOTMAP");
 
 	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 					 bootmap_start >> PAGE_SHIFT,
@@ -259,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 	printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
 		 bootmap_start, bootmap_start + bootmap_size - 1,
 		 bootmap_pages);
-
-	free_bootmem_with_active_regions(nodeid, end);
-
-	/*
-	 * convert early reserve to bootmem reserve earlier
-	 * otherwise early_node_mem could use early reserved mem
-	 * on previous node
-	 */
-	early_res_to_bootmem(start, end);
-
-	/*
-	 * in some case early_node_mem could use alloc_bootmem
-	 * to get range on other node, don't reserve that again
-	 */
-	if (nid != nodeid)
-		printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
-	else
-		reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
-					pgdat_size, BOOTMEM_DEFAULT);
 	nid = phys_to_nid(bootmap_start);
 	if (nid != nodeid)
 		printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
-	else
-		reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
-				 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
+	free_bootmem_with_active_regions(nodeid, end);
+#endif
 
 	node_set_online(nodeid);
 }
@@ -427,7 +424,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 	 * Calculate the number of big nodes that can be allocated as a result
 	 * of consolidating the remainder.
 	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
+	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
 		FAKE_NODE_MIN_SIZE;
 
 	size &= FAKE_NODE_MIN_HASH_MASK;
@@ -502,77 +499,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 }
 
 /*
- * Splits num_nodes nodes up equally starting at node_start. The return value
- * is the number of nodes split up and addr is adjusted to be at the end of the
- * last node allocated.
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
  */
-static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
-				      int num_nodes)
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 {
-	unsigned int big;
-	u64 size;
-	int i;
+	u64 end = start + size;
 
-	if (num_nodes <= 0)
-		return -1;
-	if (num_nodes > MAX_NUMNODES)
-		num_nodes = MAX_NUMNODES;
-	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
-		num_nodes;
-	/*
-	 * Calculate the number of big nodes that can be allocated as a result
-	 * of consolidating the leftovers.
-	 */
-	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
-		FAKE_NODE_MIN_SIZE;
-
-	/* Round down to nearest FAKE_NODE_MIN_SIZE. */
-	size &= FAKE_NODE_MIN_HASH_MASK;
-	if (!size) {
-		printk(KERN_ERR "Not enough memory for each node. "
-			"NUMA emulation disabled.\n");
-		return -1;
-	}
-
-	for (i = node_start; i < num_nodes + node_start; i++) {
-		u64 end = *addr + size;
-
-		if (i < big)
-			end += FAKE_NODE_MIN_SIZE;
-		/*
-		 * The final node can have the remaining system RAM. Other
-		 * nodes receive roughly the same amount of available pages.
-		 */
-		if (i == num_nodes + node_start - 1)
+	while (end - start - e820_hole_size(start, end) < size) {
+		end += FAKE_NODE_MIN_SIZE;
+		if (end > max_addr) {
 			end = max_addr;
-		else
-			while (end - *addr - e820_hole_size(*addr, end) <
-				size) {
-				end += FAKE_NODE_MIN_SIZE;
-				if (end > max_addr) {
-					end = max_addr;
-					break;
-				}
-			}
-		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
 			break;
+		}
 	}
-	return i - node_start + 1;
+	return end;
 }
 
 /*
- * Splits the remaining system RAM into chunks of size. The remaining memory is
- * always assigned to a final node and can be asymmetric. Returns the number of
- * nodes split.
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'. The return value is the number of nodes allocated.
  */
-static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
-				      u64 size)
+static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 {
-	int i = node_start;
-	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
-	while (!setup_node_range(i++, addr, size, max_addr))
-		;
-	return i - node_start;
+	nodemask_t physnode_mask = NODE_MASK_NONE;
+	u64 min_size;
+	int ret = 0;
+	int i;
+
+	if (!size)
+		return -1;
+	/*
+	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
+	 * increased accordingly if the requested size is too small. This
+	 * creates a uniform distribution of node sizes across the entire
+	 * machine (but not necessarily over physical nodes).
+	 */
+	min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
+						MAX_NUMNODES;
+	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
+	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
+		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
+				FAKE_NODE_MIN_HASH_MASK;
+	if (size < min_size) {
+		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+			size >> 20, min_size >> 20);
+		size = min_size;
+	}
+	size &= FAKE_NODE_MIN_HASH_MASK;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		if (physnodes[i].start != physnodes[i].end)
+			node_set(i, physnode_mask);
+	/*
+	 * Fill physical nodes with fake nodes of size until there is no memory
+	 * left on any of them.
+	 */
+	while (nodes_weight(physnode_mask)) {
+		for_each_node_mask(i, physnode_mask) {
+			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
+			u64 end;
+
+			end = find_end_of_node(physnodes[i].start,
+						physnodes[i].end, size);
+			/*
+			 * If there won't be at least FAKE_NODE_MIN_SIZE of
+			 * non-reserved memory in ZONE_DMA32 for the next node,
+			 * this one must extend to the boundary.
+			 */
+			if (end < dma32_end && dma32_end - end -
+			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+				end = dma32_end;
+
+			/*
+			 * If there won't be enough non-reserved memory for the
+			 * next node, this one must extend to the end of the
+			 * physical node.
+			 */
+			if (physnodes[i].end - end -
+			    e820_hole_size(end, physnodes[i].end) < size)
+				end = physnodes[i].end;
+
+			/*
+			 * Setup the fake node that will be allocated as bootmem
+			 * later. If setup_node_range() returns non-zero, there
+			 * is no more memory available on this physical node.
+			 */
+			if (setup_node_range(ret++, &physnodes[i].start,
+					end - physnodes[i].start,
+					physnodes[i].end) < 0)
+				node_clear(i, physnode_mask);
+		}
+	}
+	return ret;
 }
 
 /*
@@ -582,87 +601,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
 static int __init numa_emulation(unsigned long start_pfn,
 			unsigned long last_pfn, int acpi, int k8)
 {
-	u64 size, addr = start_pfn << PAGE_SHIFT;
+	u64 addr = start_pfn << PAGE_SHIFT;
 	u64 max_addr = last_pfn << PAGE_SHIFT;
-	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
 	int num_phys_nodes;
+	int num_nodes;
+	int i;
 
 	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 	/*
-	 * If the numa=fake command-line is just a single number N, split the
-	 * system RAM into N fake nodes.
+	 * If the numa=fake command-line contains a 'M' or 'G', it represents
+	 * the fixed node size. Otherwise, if it is just a single number N,
+	 * split the system RAM into N fake nodes.
 	 */
-	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
-		long n = simple_strtol(cmdline, NULL, 0);
-
-		num_nodes = split_nodes_interleave(addr, max_addr,
-							num_phys_nodes, n);
-		if (num_nodes < 0)
-			return num_nodes;
-		goto out;
-	}
+	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
+		u64 size;
 
-	/* Parse the command line. */
-	for (coeff_flag = 0; ; cmdline++) {
-		if (*cmdline && isdigit(*cmdline)) {
-			num = num * 10 + *cmdline - '0';
-			continue;
-		}
-		if (*cmdline == '*') {
-			if (num > 0)
-				coeff = num;
-			coeff_flag = 1;
-		}
-		if (!*cmdline || *cmdline == ',') {
-			if (!coeff_flag)
-				coeff = 1;
-			/*
-			 * Round down to the nearest FAKE_NODE_MIN_SIZE.
-			 * Command-line coefficients are in megabytes.
-			 */
-			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
-			if (size)
-				for (i = 0; i < coeff; i++, num_nodes++)
-					if (setup_node_range(num_nodes, &addr,
-						size, max_addr) < 0)
-						goto done;
-			if (!*cmdline)
-				break;
-			coeff_flag = 0;
-			coeff = -1;
-		}
-		num = 0;
-	}
-done:
-	if (!num_nodes)
-		return -1;
-	/* Fill remainder of system RAM, if appropriate. */
-	if (addr < max_addr) {
-		if (coeff_flag && coeff < 0) {
-			/* Split remaining nodes into num-sized chunks */
-			num_nodes += split_nodes_by_size(&addr, max_addr,
-								num_nodes, num);
-			goto out;
-		}
-		switch (*(cmdline - 1)) {
-		case '*':
-			/* Split remaining nodes into coeff chunks */
-			if (coeff <= 0)
-				break;
-			num_nodes += split_nodes_equally(&addr, max_addr,
-							num_nodes, coeff);
-			break;
-		case ',':
-			/* Do not allocate remaining system RAM */
-			break;
-		default:
-			/* Give one final node */
-			setup_node_range(num_nodes, &addr, max_addr - addr,
-						max_addr);
-			num_nodes++;
-		}
+		size = memparse(cmdline, &cmdline);
+		num_nodes = split_nodes_size_interleave(addr, max_addr, size);
+	} else {
+		unsigned long n;
+
+		n = simple_strtoul(cmdline, NULL, 0);
+		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
 	}
-out:
+
+	if (num_nodes < 0)
+		return num_nodes;
 	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
 	if (memnode_shift < 0) {
 		memnode_shift = 0;
@@ -742,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
 	for_each_online_node(i)
 		pages += free_all_bootmem_node(NODE_DATA(i));
 
+#ifdef CONFIG_NO_BOOTMEM
+	pages += free_all_memory_core_early(MAX_NUMNODES);
+#endif
+
 	return pages;
 }
 
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1d4eb93d333c..28195c350b97 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -6,13 +6,13 @@
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/pfn.h>
 #include <linux/percpu.h>
+#include <linux/gfp.h>
 
 #include <asm/e820.h>
 #include <asm/processor.h>
@@ -291,8 +291,29 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
 	 */
 	if (kernel_set_to_readonly &&
 	    within(address, (unsigned long)_text,
-		   (unsigned long)__end_rodata_hpage_align))
-		pgprot_val(forbidden) |= _PAGE_RW;
+		   (unsigned long)__end_rodata_hpage_align)) {
+		unsigned int level;
+
+		/*
+		 * Don't enforce the !RW mapping for the kernel text mapping,
+		 * if the current mapping is already using small page mapping.
+		 * No need to work hard to preserve large page mappings in this
+		 * case.
+		 *
+		 * This also fixes the Linux Xen paravirt guest boot failure
+		 * (because of unexpected read-only mappings for kernel identity
+		 * mappings). In this paravirt guest case, the kernel text
+		 * mapping and the kernel identity mapping share the same
+		 * page-table pages. Thus we can't really use different
+		 * protections for the kernel text and identity mappings. Also,
+		 * these shared mappings are made of small page mappings.
+		 * Thus this don't enforce !RW mapping for small page kernel
+		 * text mapping logic will help Linux Xen parvirt guest boot
+		 * aswell.
+		 */
+		if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
+			pgprot_val(forbidden) |= _PAGE_RW;
+	}
 #endif
 
 	prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 501fc60e5e4d..bbe5502ee1cb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,7 +12,7 @@
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/gfp.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
 #include <linux/rbtree.h>
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e35999..5c4ee422590e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,4 +1,5 @@
 #include <linux/mm.h>
+#include <linux/gfp.h>
 #include <asm/pgalloc.h>
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
@@ -6,6 +7,14 @@
 
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
+#ifdef CONFIG_HIGHPTE
+#define PGALLOC_USER_GFP __GFP_HIGHMEM
+#else
+#define PGALLOC_USER_GFP 0
+#endif
+
+gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
+
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
 	return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -15,16 +24,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
 	struct page *pte;
 
-#ifdef CONFIG_HIGHPTE
-	pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
-#else
-	pte = alloc_pages(PGALLOC_GFP, 0);
-#endif
+	pte = alloc_pages(__userpte_alloc_gfp, 0);
 	if (pte)
 		pgtable_page_ctor(pte);
 	return pte;
 }
 
+static int __init setup_userpte(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+
+	/*
+	 * "userpte=nohigh" disables allocation of user pagetables in
+	 * high memory.
+	 */
+	if (strcmp(arg, "nohigh") == 0)
+		__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("userpte", setup_userpte);
+
 void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
 	pgtable_page_dtor(pte);
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 46c8834aedc0..792854003ed3 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -6,7 +6,6 @@
 #include <linux/swap.h>
 #include <linux/smp.h>
 #include <linux/highmem.h>
-#include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/module.h>
@@ -19,6 +18,7 @@
 #include <asm/e820.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
+#include <asm/io.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -129,6 +129,7 @@ static int __init parse_reservetop(char *arg)
 
 	address = memparse(arg, &arg);
 	reserve_top_address(address);
+	fixup_early_ioremap();
 	return 0;
 }
 early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 28c68762648f..f9897f7a9ef1 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -363,6 +363,54 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
 
+	/*
+	 * Join together blocks on the same node, holes between
+	 * which don't overlap with memory on other nodes.
+	 */
+	for (i = 0; i < num_node_memblks; ++i) {
+		int j, k;
+
+		for (j = i + 1; j < num_node_memblks; ++j) {
+			unsigned long start, end;
+
+			if (memblk_nodeid[i] != memblk_nodeid[j])
+				continue;
+			start = min(node_memblk_range[i].end,
+				    node_memblk_range[j].end);
+			end = max(node_memblk_range[i].start,
+				  node_memblk_range[j].start);
+			for (k = 0; k < num_node_memblks; ++k) {
+				if (memblk_nodeid[i] == memblk_nodeid[k])
+					continue;
+				if (start < node_memblk_range[k].end &&
+				    end > node_memblk_range[k].start)
+					break;
+			}
+			if (k < num_node_memblks)
+				continue;
+			start = min(node_memblk_range[i].start,
+				    node_memblk_range[j].start);
+			end = max(node_memblk_range[i].end,
+				  node_memblk_range[j].end);
+			printk(KERN_INFO "SRAT: Node %d "
+			       "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
+			       memblk_nodeid[i],
+			       node_memblk_range[i].start,
+			       node_memblk_range[i].end,
+			       node_memblk_range[j].start,
+			       node_memblk_range[j].end,
+			       start, end);
+			node_memblk_range[i].start = start;
+			node_memblk_range[i].end = end;
+			k = --num_node_memblks - j;
+			memmove(memblk_nodeid + j, memblk_nodeid + j+1,
+				k * sizeof(*memblk_nodeid));
+			memmove(node_memblk_range + j, node_memblk_range + j+1,
+				k * sizeof(*node_memblk_range));
+			--j;
+		}
+	}
+
 	memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
 					   memblk_nodeid);
 	if (memnode_shift < 0) {
@@ -461,7 +509,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
 		 * node, it must now point to the fake node ID.
 		 */
 		for (j = 0; j < MAX_LOCAL_APIC; j++)
-			if (apicid_to_node[j] == nid)
+			if (apicid_to_node[j] == nid &&
+			    fake_apicid_to_node[j] == NUMA_NO_NODE)
 				fake_apicid_to_node[j] = i;
 	}
 	for (i = 0; i < num_nodes; i++)
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 65b58e4b0b8b..426f3a1a64d3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -41,7 +41,7 @@ union smp_flush_state {
 	struct {
 		struct mm_struct *flush_mm;
 		unsigned long flush_va;
-		spinlock_t tlbstate_lock;
+		raw_spinlock_t tlbstate_lock;
 		DECLARE_BITMAP(flush_cpumask, NR_CPUS);
 	};
 	char pad[INTERNODE_CACHE_BYTES];
@@ -181,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 	 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
 	 * probably not worth checking this for a cache-hot lock.
 	 */
-	spin_lock(&f->tlbstate_lock);
+	raw_spin_lock(&f->tlbstate_lock);
 
 	f->flush_mm = mm;
 	f->flush_va = va;
@@ -199,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
 
 	f->flush_mm = NULL;
 	f->flush_va = 0;
-	spin_unlock(&f->tlbstate_lock);
+	raw_spin_unlock(&f->tlbstate_lock);
 }
 
 void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -223,7 +223,7 @@ static int __cpuinit init_smp_flush(void)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(flush_state); i++)
-		spin_lock_init(&flush_state[i].tlbstate_lock);
+		raw_spin_lock_init(&flush_state[i].tlbstate_lock);
 
 	return 0;
 }