aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2011-07-12 03:45:34 -0400
committerH. Peter Anvin <hpa@linux.intel.com>2011-07-13 00:58:29 -0400
commit1e01979c8f502ac13e3cdece4f38712c5944e6e8 (patch)
treed47c4700bfdcffc3f7f68b19d50c588c20689b48
parentd0ead157387f19801beb1b419568723b2e9b7c79 (diff)
x86, numa: Implement pfn -> nid mapping granularity check
SPARSEMEM w/o VMEMMAP and DISCONTIGMEM, both used only on 32bit, use the sections array to map pfn to nid, which is limited in granularity. If NUMA nodes are laid out such that the mapping cannot be accurate, boot will fail triggering BUG_ON() in mminit_verify_page_links(). On 32bit, it's 512MiB w/ PAE and SPARSEMEM. This seems to have been granular enough until commit 2706a0bf7b (x86, NUMA: Enable CONFIG_AMD_NUMA on 32bit too). Apparently, there is a machine which aligns NUMA nodes to 128MiB and has only AMD NUMA but not SRAT. This led to the following BUG_ON(). On node 0 totalpages: 2096615 DMA zone: 32 pages used for memmap DMA zone: 0 pages reserved DMA zone: 3927 pages, LIFO batch:0 Normal zone: 1740 pages used for memmap Normal zone: 220978 pages, LIFO batch:31 HighMem zone: 16405 pages used for memmap HighMem zone: 1853533 pages, LIFO batch:31 BUG: Int 6: CR2 (null) EDI (null) ESI 00000002 EBP 00000002 ESP c1543ecc EBX f2400000 EDX 00000006 ECX (null) EAX 00000001 err (null) EIP c16209aa CS 00000060 flg 00010002 Stack: f2400000 00220000 f7200800 c1620613 00220000 01000000 04400000 00238000 (null) f7200000 00000002 f7200b58 f7200800 c1620929 000375fe (null) f7200b80 c16395f0 00200a02 f7200a80 (null) 000375fe 00000002 (null) Pid: 0, comm: swapper Not tainted 2.6.39-rc5-00181-g2706a0b #17 Call Trace: [<c136b1e5>] ? early_fault+0x2e/0x2e [<c16209aa>] ? mminit_verify_page_links+0x12/0x42 [<c1620613>] ? memmap_init_zone+0xaf/0x10c [<c1620929>] ? free_area_init_node+0x2b9/0x2e3 [<c1607e99>] ? free_area_init_nodes+0x3f2/0x451 [<c1601d80>] ? paging_init+0x112/0x118 [<c15f578d>] ? setup_arch+0x791/0x82f [<c15f43d9>] ? start_kernel+0x6a/0x257 This patch implements node_map_pfn_alignment(), which determines the maximum internode alignment, and updates numa_register_memblks() to reject the NUMA configuration if the node alignment is finer than the pfn -> nid mapping granularity of the memory model as determined by PAGES_PER_SECTION. 
This makes the problematic machine boot w/ flatmem by rejecting the NUMA config and provides protection against crazy NUMA configurations. Signed-off-by: Tejun Heo <tj@kernel.org> Link: http://lkml.kernel.org/r/20110712074534.GB2872@htj.dyndns.org LKML-Reference: <20110628174613.GP478@escobedo.osrc.amd.com> Reported-and-Tested-by: Hans Rosenfeld <hans.rosenfeld@amd.com> Cc: Conny Seidel <conny.seidel@amd.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
-rw-r--r--arch/x86/mm/numa.c15
-rw-r--r--include/linux/mm.h1
-rw-r--r--mm/page_alloc.c54
3 files changed, 70 insertions, 0 deletions
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f5510d889a22..fbeaaf416610 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -496,6 +496,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
496 496
497static int __init numa_register_memblks(struct numa_meminfo *mi) 497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{ 498{
499 unsigned long uninitialized_var(pfn_align);
499 int i, nid; 500 int i, nid;
500 501
501 /* Account for nodes with cpus and no memory */ 502 /* Account for nodes with cpus and no memory */
@@ -511,6 +512,20 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
511 512
512 /* for out of order entries */ 513 /* for out of order entries */
513 sort_node_map(); 514 sort_node_map();
515
516 /*
517 * If sections array is gonna be used for pfn -> nid mapping, check
518 * whether its granularity is fine enough.
519 */
520#ifdef NODE_NOT_IN_PAGE_FLAGS
521 pfn_align = node_map_pfn_alignment();
522 if (pfn_align && pfn_align < PAGES_PER_SECTION) {
523 printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
524 PFN_PHYS(pfn_align) >> 20,
525 PFN_PHYS(PAGES_PER_SECTION) >> 20);
526 return -EINVAL;
527 }
528#endif
514 if (!numa_meminfo_cover_memory(mi)) 529 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL; 530 return -EINVAL;
516 531
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9670f71d7be9..c70a326b8f26 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1313,6 +1313,7 @@ extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
1313 unsigned long end_pfn); 1313 unsigned long end_pfn);
1314extern void remove_all_active_ranges(void); 1314extern void remove_all_active_ranges(void);
1315void sort_node_map(void); 1315void sort_node_map(void);
1316unsigned long node_map_pfn_alignment(void);
1316unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, 1317unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn,
1317 unsigned long end_pfn); 1318 unsigned long end_pfn);
1318extern unsigned long absent_pages_in_range(unsigned long start_pfn, 1319extern unsigned long absent_pages_in_range(unsigned long start_pfn,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..9119faae6e6a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4585,6 +4585,60 @@ void __init sort_node_map(void)
4585 cmp_node_active_region, NULL); 4585 cmp_node_active_region, NULL);
4586} 4586}
4587 4587
4588/**
4589 * node_map_pfn_alignment - determine the maximum internode alignment
4590 *
4591 * This function should be called after node map is populated and sorted.
4592 * It calculates the maximum power of two alignment which can distinguish
4593 * all the nodes.
4594 *
4595 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4596 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4597 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4598 * shifted, 1GiB is enough and this function will indicate so.
4599 *
4600 * This is used to test whether pfn -> nid mapping of the chosen memory
4601 * model has fine enough granularity to avoid incorrect mapping for the
4602 * populated node map.
4603 *
4604 * Returns the determined alignment in pfn's. 0 if there is no alignment
4605 * requirement (single node).
4606 */
4607unsigned long __init node_map_pfn_alignment(void)
4608{
4609 unsigned long accl_mask = 0, last_end = 0;
4610 int last_nid = -1;
4611 int i;
4612
4613 for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
4614 int nid = early_node_map[i].nid;
4615 unsigned long start = early_node_map[i].start_pfn;
4616 unsigned long end = early_node_map[i].end_pfn;
4617 unsigned long mask;
4618
4619 if (!start || last_nid < 0 || last_nid == nid) {
4620 last_nid = nid;
4621 last_end = end;
4622 continue;
4623 }
4624
4625 /*
4626 * Start with a mask granular enough to pin-point to the
4627 * start pfn and tick off bits one-by-one until it becomes
4628 * too coarse to separate the current node from the last.
4629 */
4630 mask = ~((1 << __ffs(start)) - 1);
4631 while (mask && last_end <= (start & (mask << 1)))
4632 mask <<= 1;
4633
4634 /* accumulate all internode masks */
4635 accl_mask |= mask;
4636 }
4637
4638 /* convert mask to number of pages */
4639 return ~accl_mask + 1;
4640}
4641
4588/* Find the lowest pfn for a node */ 4642/* Find the lowest pfn for a node */
4589static unsigned long __init find_min_pfn_for_node(int nid) 4643static unsigned long __init find_min_pfn_for_node(int nid)
4590{ 4644{