author     Andy Whitcroft <apw@shadowen.org>          2005-06-23 03:07:59 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>  2005-06-23 12:45:05 -0400
commit     641c767389b19859a45e6de46d8e18cd935bdb60 (patch)
tree       b3ac95aaea213823c226b181b8a301e4ae95bd9d
parent     05b79bdcb48c18cd9b580c39e3efb9a1ab078151 (diff)
[PATCH] sparsemem swiss cheese numa layouts
The part of the sparsemem patch which modifies memmap_init_zone() has recently become a problem.  It changes behavior so that there is a call to pfn_to_page() for each individual page inside of a node's range: node_start_pfn through node_end_pfn.  It used to simply do this once, at the beginning of the node, but having sparsemem's non-contiguous mem_map[]s inside of a node made it necessary to change.

Mike Kravetz recently wrote a patch which made the NUMA code accept some new kinds of layouts.  The system's memory was laid out like this, with node 0's memory in two pieces: one before and one after node 1's memory:

	Node 0: +++++     +++++
	Node 1:      +++++

Previous behavior, before Mike's patch, was to assign nodes like this:

	Node 0: 00000     XXXXX
	Node 1:      11111

where the 'X' areas were simply thrown away.  The new behavior was to make the pg_data_t span node 0 across all of its areas, including areas that are really node 1's:

	Node 0: 000000000000000
	Node 1:      11111

This wastes a little bit of mem_map space, but ends up being OK, and more fully utilizes the system's memory.  memmap_init_zone() initializes all of the "struct page"s for node 0, even for the "hole", but those never get used, because there is no pfn_to_page() that resolves to those pages.  However, since it only calls pfn_to_page() once, memmap_init_zone() always uses the pages that were allocated for node0->node_mem_map:

	struct page *start = pfn_to_page(start_pfn);
	// effectively start = &node->node_mem_map[0]
	for (page = start; page < (start + size); page++) {
		init_page_here();...
	}

Slow, and wasteful, but generally harmless.

But modify that to call pfn_to_page() on each loop iteration (as sparsemem does):

	for (pfn = start_pfn; pfn < (start_pfn + size); pfn++) {
		page = pfn_to_page(pfn);
	}

and you end up trying to initialize node 1's pages too early, along with bogus data from node 0.  This patch checks for those weird layouts and declines to touch the pages, making the more frequent pfn_to_page() calls OK to do.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
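For illustration only (not part of the patch or the kernel sources), the user-space sketch below models the "swiss cheese" layout described above with made-up pfn ranges: node 0 owns two disjoint ranges with node 1's range in the gap, and a per-pfn ownership check analogous to early_pfn_in_nid() decides which pfns node 0's memmap_init_zone() pass may touch.  All names and values here are hypothetical.

	/* Illustrative sketch only -- not kernel code; ranges are made up. */
	#include <stdio.h>

	struct node_range {
		int nid;
		unsigned long start_pfn;
		unsigned long end_pfn;	/* exclusive */
	};

	/* Node 0 in two pieces, with node 1's memory in the gap between them. */
	static const struct node_range ranges[] = {
		{ 0, 0x000, 0x100 },
		{ 1, 0x100, 0x200 },
		{ 0, 0x200, 0x300 },
	};

	/* Stand-in for early_pfn_to_nid(): which node really owns this pfn? */
	static int owner_nid(unsigned long pfn)
	{
		unsigned int i;

		for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
			if (pfn >= ranges[i].start_pfn && pfn < ranges[i].end_pfn)
				return ranges[i].nid;
		return -1;
	}

	int main(void)
	{
		unsigned long pfn;
		int inited = 0, skipped = 0;

		/* Node 0's pg_data_t now spans 0x000-0x300, node 1's pfns included. */
		for (pfn = 0x000; pfn < 0x300; pfn++) {
			if (owner_nid(pfn) != 0) {	/* i.e. !early_pfn_in_nid(pfn, 0) */
				skipped++;		/* decline to touch node 1's pages */
				continue;
			}
			inited++;			/* would initialize this struct page */
		}
		printf("node 0 init: %d pfns initialized, %d skipped\n", inited, skipped);
		return 0;
	}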
-rw-r--r--arch/ppc64/Kconfig12
-rw-r--r--include/linux/mmzone.h6
-rw-r--r--mm/page_alloc.c2
3 files changed, 20 insertions, 0 deletions
diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
index 011b5c0bf1d0..85f8fcf44b6c 100644
--- a/arch/ppc64/Kconfig
+++ b/arch/ppc64/Kconfig
@@ -211,6 +211,18 @@ config ARCH_FLATMEM_ENABLE
 
 source "mm/Kconfig"
 
+# Some NUMA nodes have memory ranges that span
+# other nodes.  Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node.
+#
+# This is a relatively temporary hack that should
+# be able to go away when sparsemem is fully in
+# place
+config NODES_SPAN_OTHER_NODES
+	def_bool y
+	depends on NEED_MULTIPLE_NODES
+
 config NUMA
 	bool "NUMA support"
 	depends on DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 19860d317ec2..746b57e3d370 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -528,6 +528,12 @@ void sparse_init(void);
 #define sparse_init()	do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
+#else
+#define early_pfn_in_nid(pfn, nid)	(1)
+#endif
+
 #ifndef early_pfn_valid
 #define early_pfn_valid(pfn)	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c1b8982a6da..1eb683f9b3af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1656,6 +1656,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
 		if (!early_pfn_valid(pfn))
 			continue;
+		if (!early_pfn_in_nid(pfn, nid))
+			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 0);