author		Andy Whitcroft <apw@shadowen.org>	2005-06-23 03:07:54 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-06-23 12:45:04 -0400
commit		d41dee369bff3b9dcb6328d4d822926c28cc2594 (patch)
tree		a0405f3b7af3ebca21838a7d427bd75a067bf850
parent		af705362ab6018071310c5fcd436a6b457517d5f (diff)
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[].  This kind
of mem_map[] is needed by discontiguous memory machines (like in the
old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems.
Sparsemem replaces DISCONTIGMEM when enabled, and it is hoped that
it can eventually become a complete replacement.

A significant advantage over DISCONTIGMEM is that it's completely
separated from CONFIG_NUMA.  When producing this patch, it became
apparent that NUMA and DISCONTIG are often confused.

Another advantage is that sparse doesn't require each NUMA node's
ranges to be contiguous.  It can handle overlapping ranges between
nodes with no problems, where DISCONTIGMEM currently throws away
that memory.

Sparsemem uses an array to provide different pfn_to_page()
translations for each SECTION_SIZE area of physical memory.  This is
what allows the mem_map[] to be chopped up.

In order to do quick pfn_to_page() operations, the section number of
the page is encoded in page->flags.  Part of the sparsemem
infrastructure enables sharing of these bits more dynamically (at
compile-time) between the page_zone() and sparsemem operations.
However, on 32-bit architectures, the number of bits is quite
limited, and may require growing the size of the page->flags type in
certain conditions.  Several things might force this to occur: a
decrease in the SECTION_SIZE (if you want to hotplug smaller areas
of memory), an increase in the physical address space, or an
increase in the number of used page->flags bits.

One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags.  It
might provide speed increases on certain platforms and will be
stored there if there is room.  But, if out of room, an alternate
(theoretically slower) mechanism is used.

This patch introduces CONFIG_FLATMEM.  It is used in almost all
cases where there used to be an #ifndef DISCONTIG, because SPARSEMEM
and DISCONTIGMEM often have to compile out the same areas of code.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
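To make the bit-sharing concrete, here is a minimal user-space C sketch of
the SECTION/NODE/ZONE packing the patch sets up in include/linux/mm.h below.
The widths are invented example values (the real ones derive from
SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT and FLAGS_RESERVED at kernel build
time); only the shift-and-mask arithmetic mirrors the patch.

#include <assert.h>
#include <stdio.h>

/* Invented example widths, NOT the kernel's values. */
#define BITS_TOTAL	32UL
#define SECTIONS_WIDTH	6UL
#define NODES_WIDTH	2UL
#define ZONES_WIDTH	2UL

/* Fields pack downward from the top bit: | SECTION | NODE | ZONE | ... | */
#define SECTIONS_PGOFF	(BITS_TOTAL - SECTIONS_WIDTH)
#define NODES_PGOFF	(SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF	(NODES_PGOFF - ZONES_WIDTH)

#define SECTIONS_MASK	((1UL << SECTIONS_WIDTH) - 1)
#define NODES_MASK	((1UL << NODES_WIDTH) - 1)
#define ZONES_MASK	((1UL << ZONES_WIDTH) - 1)

int main(void)
{
	unsigned long flags = 0;

	/* set_page_section()/set_page_node()/set_page_zone() style writes:
	 * starting from zero here; the kernel clears the field first. */
	flags |= (37UL & SECTIONS_MASK) << SECTIONS_PGOFF;
	flags |= (2UL & NODES_MASK) << NODES_PGOFF;
	flags |= (1UL & ZONES_MASK) << ZONES_PGOFF;

	/* page_to_section()/page_to_nid()/page_zonenum() style reads. */
	assert(((flags >> SECTIONS_PGOFF) & SECTIONS_MASK) == 37);
	assert(((flags >> NODES_PGOFF) & NODES_MASK) == 2);
	assert(((flags >> ZONES_PGOFF) & ZONES_MASK) == 1);
	printf("packed flags = %#lx\n", flags);
	return 0;
}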
-rw-r--r--	arch/i386/Kconfig	1
-rw-r--r--	include/linux/mm.h	92
-rw-r--r--	include/linux/mmzone.h	96
-rw-r--r--	include/linux/numa.h	2
-rw-r--r--	mm/Kconfig	38
-rw-r--r--	mm/Makefile	1
-rw-r--r--	mm/bootmem.c	9
-rw-r--r--	mm/memory.c	2
-rw-r--r--	mm/page_alloc.c	39
-rw-r--r--	mm/sparse.c	85
10 files changed, 332 insertions(+), 33 deletions(-)
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 3b7248126d29..f0064b5e3702 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -813,6 +813,7 @@ source "mm/Kconfig"
 config HAVE_ARCH_EARLY_PFN_TO_NID
 	bool
 	default y
+	depends on NUMA
 
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57b2ead51dba..6eb7f48317f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -397,40 +397,80 @@ static inline void put_page(struct page *page)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-/* Page flags: | NODE | ZONE | ... | FLAGS | */
-#define NODES_PGOFF		((sizeof(page_flags_t)*8) - NODES_SHIFT)
-#define ZONES_PGOFF		(NODES_PGOFF - ZONES_SHIFT)
+
+/*
+ * page->flags layout:
+ *
+ * There are three possibilities for how page->flags get
+ * laid out.  The first is for the normal case, without
+ * sparsemem.  The second is for sparsemem when there is
+ * plenty of space for node and section.  The last is when
+ * we have run out of space and have to fall back to an
+ * alternate (slower) way of determining the node.
+ *
+ *        No sparsemem: |       NODE     | ZONE | ... | FLAGS |
+ * with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
+ *   no space for node: | SECTION |     ZONE    | ... | FLAGS |
+ */
+#ifdef CONFIG_SPARSEMEM
+#define SECTIONS_WIDTH		SECTIONS_SHIFT
+#else
+#define SECTIONS_WIDTH		0
+#endif
+
+#define ZONES_WIDTH		ZONES_SHIFT
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
+#define NODES_WIDTH		NODES_SHIFT
+#else
+#define NODES_WIDTH		0
+#endif
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+#define SECTIONS_PGOFF		((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
+
+/*
+ * We are going to use the flags for the page to node mapping if its in
+ * there.  This includes the case where there is no node, so it is implicit.
+ */
+#define FLAGS_HAS_NODE		(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+
+#ifndef PFN_SECTION_SHIFT
+#define PFN_SECTION_SHIFT	0
+#endif
 
 /*
  * Define the bit shifts to access each section.  For non-existant
  * sections we define the shift as 0; that plus a 0 mask ensures
  * the compiler will optimise away reference to them.
  */
-#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_SHIFT != 0))
-#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_SHIFT != 0))
+#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
 
-/* NODE:ZONE is used to lookup the zone from a page. */
+/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
+#if FLAGS_HAS_NODE
 #define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#else
+#define ZONETABLE_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
+#endif
 #define ZONETABLE_PGSHIFT	ZONES_PGSHIFT
 
-#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
-#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
+#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
+#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
 #endif
 
-#define NODEZONE(node, zone)	((node << ZONES_SHIFT) | zone)
-
-#define ZONES_MASK		((1UL << ZONES_SHIFT) - 1)
-#define NODES_MASK		((1UL << NODES_SHIFT) - 1)
+#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
+#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
+#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
 static inline unsigned long page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
-static inline unsigned long page_to_nid(struct page *page)
-{
-	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
-}
 
 struct zone;
 extern struct zone *zone_table[];
@@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page)
 		ZONETABLE_MASK];
 }
 
+static inline unsigned long page_to_nid(struct page *page)
+{
+	if (FLAGS_HAS_NODE)
+		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
+	else
+		return page_zone(page)->zone_pgdat->node_id;
+}
+static inline unsigned long page_to_section(struct page *page)
+{
+	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
+}
+
 static inline void set_page_zone(struct page *page, unsigned long zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
@@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node)
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
 	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
 }
+static inline void set_page_section(struct page *page, unsigned long section)
+{
+	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
+}
 
 static inline void set_page_links(struct page *page, unsigned long zone,
-	unsigned long node)
+	unsigned long node, unsigned long pfn)
 {
 	set_page_zone(page, zone);
 	set_page_node(page, node);
+	set_page_section(page, pfn_to_section_nr(pfn));
 }
 
 #ifndef CONFIG_DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6ef07de98d69..19860d317ec2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -269,7 +269,9 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[GFP_ZONETYPES];
 	int nr_zones;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 	struct page *node_mem_map;
+#endif
 	struct bootmem_data *bdata;
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
@@ -284,7 +286,11 @@ typedef struct pglist_data {
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
 #define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 #define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
+#else
+#define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
+#endif
 #define nid_page_nr(nid, pagenr)	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
 extern struct pglist_data *pgdat_list;
@@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data;
 
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
+#ifdef CONFIG_SPARSEMEM
+#include <asm/sparsemem.h>
+#endif
+
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
  * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
@@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data;
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
+#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
+#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
+
+#ifdef CONFIG_SPARSEMEM
+
+/*
+ * SECTION_SHIFT		#bits space required to store a section #
+ *
+ * PA_SECTION_SHIFT		physical address to/from section number
+ * PFN_SECTION_SHIFT		pfn to/from section number
+ */
+#define SECTIONS_SHIFT		(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+
+#define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
+#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
+
+#define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)
+
+#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)
+#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))
+
+#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
+#error Allocator MAX_ORDER exceeds SECTION_SIZE
+#endif
+
+struct page;
+struct mem_section {
+	struct page *section_mem_map;
+};
+
+extern struct mem_section mem_section[NR_MEM_SECTIONS];
+
+/*
+ * Given a kernel address, find the home node of the underlying memory.
+ */
+#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
+
+static inline struct mem_section *__pfn_to_section(unsigned long pfn)
+{
+	return &mem_section[pfn_to_section_nr(pfn)];
+}
+
+#define pfn_to_page(pfn)					\
+({								\
+	unsigned long __pfn = (pfn);				\
+	__pfn_to_section(__pfn)->section_mem_map + __pfn;	\
+})
+#define page_to_pfn(page)					\
+({								\
+	page - mem_section[page_to_section(page)].section_mem_map; \
+})
+
+static inline int pfn_valid(unsigned long pfn)
+{
+	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
+		return 0;
+	return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0;
+}
+
+/*
+ * These are _only_ used during initialisation, therefore they
+ * can use __initdata ...  They could have names to indicate
+ * this restriction.
+ */
+#ifdef CONFIG_NUMA
+#define pfn_to_nid		early_pfn_to_nid
+#endif
+
+#define pfn_to_pgdat(pfn)					\
+({								\
+	NODE_DATA(pfn_to_nid(pfn));				\
+})
+
+#define early_pfn_valid(pfn)	pfn_valid(pfn)
+void sparse_init(void);
+#else
+#define sparse_init()	do {} while (0)
+#endif /* CONFIG_SPARSEMEM */
+
+#ifndef early_pfn_valid
+#define early_pfn_valid(pfn)	(1)
+#endif
+
+void memory_present(int nid, unsigned long start, unsigned long end);
+unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMZONE_H */
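For a concrete feel of the section arithmetic above, assume
SECTION_SIZE_BITS = 28, PAGE_SHIFT = 12 and MAX_PHYSMEM_BITS = 32 (plausible
32-bit values chosen here for illustration; each architecture sets its own
in asm/sparsemem.h).  Then each section spans 256MB, PFN_SECTION_SHIFT is
16, and pfn_to_section_nr() is a single shift, as this user-space sketch
checks:

#include <assert.h>

/* Assumed example values, not mandated by the patch. */
#define SECTION_SIZE_BITS	28
#define PAGE_SHIFT		12
#define MAX_PHYSMEM_BITS	32

#define SECTIONS_SHIFT		(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
#define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)
#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION - 1))

int main(void)
{
	assert(NR_MEM_SECTIONS == 16);		/* 4GB / 256MB sections */
	assert(PAGES_PER_SECTION == 65536);	/* 256MB of 4KB pages */

	unsigned long pfn = 0x12345;
	assert((pfn >> PFN_SECTION_SHIFT) == 1);	/* pfn_to_section_nr() */
	assert((pfn & PAGE_SECTION_MASK) == 0x10000);	/* section-aligned pfn */
	return 0;
}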
diff --git a/include/linux/numa.h b/include/linux/numa.h
index bd0c8c4e9a95..f0c539bd3cfc 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -3,7 +3,7 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_FLATMEM
 #include <asm/numnodes.h>
 #endif
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 5127441561b4..cd379936cac6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -6,6 +6,7 @@ choice
 	prompt "Memory model"
 	depends on SELECT_MEMORY_MODEL
 	default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+	default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
 	default FLATMEM_MANUAL
 
 config FLATMEM_MANUAL
@@ -17,7 +18,15 @@ config FLATMEM_MANUAL
 	  only have one option here: FLATMEM.  This is normal
 	  and a correct option.
 
-	  If unsure, choose this option over any other.
+	  Some users of more advanced features like NUMA and
+	  memory hotplug may have different options here.
+	  DISCONTIGMEM is an more mature, better tested system,
+	  but is incompatible with memory hotplug and may suffer
+	  decreased performance over SPARSEMEM.  If unsure between
+	  "Sparse Memory" and "Discontiguous Memory", choose
+	  "Discontiguous Memory".
+
+	  If unsure, choose this option (Flat Memory) over any other.
 
 config DISCONTIGMEM_MANUAL
 	bool "Discontigious Memory"
@@ -35,15 +44,38 @@ config DISCONTIGMEM_MANUAL
 
 	  If unsure, choose "Flat Memory" over this option.
 
+config SPARSEMEM_MANUAL
+	bool "Sparse Memory"
+	depends on ARCH_SPARSEMEM_ENABLE
+	help
+	  This will be the only option for some systems, including
+	  memory hotplug systems.  This is normal.
+
+	  For many other systems, this will be an alternative to
+	  "Discontigious Memory".  This option provides some potential
+	  performance benefits, along with decreased code complexity,
+	  but it is newer, and more experimental.
+
+	  If unsure, choose "Discontiguous Memory" or "Flat Memory"
+	  over this option.
+
 endchoice
 
 config DISCONTIGMEM
 	def_bool y
 	depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
 
+config SPARSEMEM
+	def_bool y
+	depends on SPARSEMEM_MANUAL
+
 config FLATMEM
 	def_bool y
-	depends on !DISCONTIGMEM || FLATMEM_MANUAL
+	depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+	def_bool y
+	depends on !SPARSEMEM
 
 #
 # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
@@ -56,4 +88,4 @@ config NEED_MULTIPLE_NODES
 
 config HAVE_MEMORY_PRESENT
 	def_bool y
-	depends on ARCH_HAVE_MEMORY_PRESENT
+	depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6a..8f70ffd763c8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)	+= mempolicy.o
+obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d8..f82f7aebbee3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -256,6 +256,7 @@ found:
 static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 {
 	struct page *page;
+	unsigned long pfn;
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long i, count, total = 0;
 	unsigned long idx;
@@ -266,7 +267,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 
 	count = 0;
 	/* first extant page of the node */
-	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+	pfn = bdata->node_boot_start >> PAGE_SHIFT;
 	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 	map = bdata->node_bootmem_map;
 	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +276,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 		gofast = 1;
 	for (i = 0; i < idx; ) {
 		unsigned long v = ~map[i / BITS_PER_LONG];
+
 		if (gofast && v == ~0UL) {
 			int j, order;
 
+			page = pfn_to_page(pfn);
 			count += BITS_PER_LONG;
 			__ClearPageReserved(page);
 			order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +295,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 			page += BITS_PER_LONG;
 		} else if (v) {
 			unsigned long m;
+
+			page = pfn_to_page(pfn);
 			for (m = 1; m && i < idx; m<<=1, page++, i++) {
 				if (v & m) {
 					count++;
@@ -302,8 +307,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 			}
 		} else {
 			i+=BITS_PER_LONG;
-			page += BITS_PER_LONG;
 		}
+		pfn += BITS_PER_LONG;
 	}
 	total += count;
 
diff --git a/mm/memory.c b/mm/memory.c
index da91b7bf9986..30975ef48722 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
 struct page *mem_map;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20e239599db0..5c1b8982a6da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -1649,11 +1649,15 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
-	struct page *start = pfn_to_page(start_pfn);
 	struct page *page;
+	int end_pfn = start_pfn + size;
+	int pfn;
 
-	for (page = start; page < (start + size); page++) {
-		set_page_links(page, zone, nid);
+	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 0);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
@@ -1677,6 +1681,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	}
 }
 
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1861,7 +1879,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 	unsigned long size, realsize;
 	unsigned long batch;
 
-	zone_table[NODEZONE(nid, j)] = zone;
 	realsize = size = zones_size[j];
 	if (zholes_size)
 		realsize -= zholes_size[j];
@@ -1927,6 +1944,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 	memmap_init(size, nid, j, zone_start_pfn);
 
+	zonetable_add(zone, nid, j, zone_start_pfn, size);
+
 	zone_start_pfn += size;
 
 	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1935,28 +1954,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
-	unsigned long size;
-	struct page *map;
-
 	/* Skip empty nodes */
 	if (!pgdat->node_spanned_pages)
 		return;
 
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 	/* ia64 gets its own node_mem_map, before this, without bootmem */
 	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
 		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
 		map = alloc_remap(pgdat->node_id, size);
 		if (!map)
 			map = alloc_bootmem_node(pgdat, size);
 		pgdat->node_mem_map = map;
 	}
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 	/*
 	 * With no DISCONTIG, the global mem_map is just set as node 0's
 	 */
 	if (pgdat == NODE_DATA(0))
 		mem_map = NODE_DATA(0)->node_mem_map;
 #endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
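A sketch of the zone_table indexing that zonetable_add() above fills in, for
the no-room-for-node case where the key is SECTION:ZONE rather than
NODE:ZONE.  The widths are toy values for illustration; only the structure
of ZONETABLE_INDEX() mirrors the patch:

#include <assert.h>

/* Toy widths, assumed for illustration. */
#define ZONES_SHIFT		2
#define SECTIONS_SHIFT		4
#define ZONETABLE_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)

#define ZONETABLE_INDEX(x, zone_nr)	(((x) << ZONES_SHIFT) | (zone_nr))

struct zone { int id; };
static struct zone *zone_table[1 << ZONETABLE_SHIFT];

int main(void)
{
	static struct zone normal = { 1 };

	/* zonetable_add() without FLAGS_HAS_NODE: every section a zone
	 * spans points at that zone; here sections 2..4 for zone id 1. */
	for (unsigned long snum = 2; snum <= 4; snum++)
		zone_table[ZONETABLE_INDEX(snum, 1)] = &normal;

	/* page_zone() recovers the zone from a SECTION:ZONE key. */
	unsigned long key = ZONETABLE_INDEX(3UL, 1);
	assert(zone_table[key] == &normal);
	return 0;
}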
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000000..f888385b9e14
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,85 @@
+/*
+ * sparse memory mappings.
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+
+/*
+ * Permanent SPARSEMEM data:
+ *
+ * 1) mem_section	- memory sections, mem_map's for valid memory
+ */
+struct mem_section mem_section[NR_MEM_SECTIONS];
+EXPORT_SYMBOL(mem_section);
+
+/* Record a memory area against a node. */
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+	unsigned long pfn;
+
+	start &= PAGE_SECTION_MASK;
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+		unsigned long section = pfn_to_section_nr(pfn);
+		if (!mem_section[section].section_mem_map)
+			mem_section[section].section_mem_map = (void *) -1;
+	}
+}
+
+/*
+ * Only used by the i386 NUMA architecures, but relatively
+ * generic code.
+ */
+unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
+						  unsigned long end_pfn)
+{
+	unsigned long pfn;
+	unsigned long nr_pages = 0;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		if (nid != early_pfn_to_nid(pfn))
+			continue;
+
+		if (pfn_valid(pfn))
+			nr_pages += PAGES_PER_SECTION;
+	}
+
+	return nr_pages * sizeof(struct page);
+}
+
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void sparse_init(void)
+{
+	unsigned long pnum;
+	struct page *map;
+	int nid;
+
+	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+		if (!mem_section[pnum].section_mem_map)
+			continue;
+
+		nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+		map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
+		if (!map)
+			map = alloc_bootmem_node(NODE_DATA(nid),
+				sizeof(struct page) * PAGES_PER_SECTION);
+		if (!map) {
+			mem_section[pnum].section_mem_map = 0;
+			continue;
+		}
+
+		/*
+		 * Subtle, we encode the real pfn into the mem_map such that
+		 * the identity pfn - section_mem_map will return the actual
+		 * physical page frame number.
+		 */
+		mem_section[pnum].section_mem_map = map -
+						section_nr_to_pfn(pnum);
+	}
+}
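The "subtle" encoding at the end of sparse_init() deserves a worked example:
section_mem_map stores the map pointer pre-biased downward by the section's
first pfn, so pfn_to_page() is a single add and page_to_pfn() a single
subtract, with no per-section base/offset bookkeeping.  A user-space sketch
of the same trick with toy sizes (note the biased pointer is formally out of
bounds in ISO C, just as the kernel version relies on flat addressing):

#include <assert.h>

/* Toy geometry: sections of 8 pages -- stand-ins for the real
 * PAGES_PER_SECTION/NR_MEM_SECTIONS, which are architecture-defined. */
#define PFN_SECTION_SHIFT	3
#define PAGES_PER_SECTION	(1UL << PFN_SECTION_SHIFT)
#define NR_MEM_SECTIONS		4

struct page { unsigned long flags; };
struct mem_section { struct page *section_mem_map; };

static struct mem_section mem_section[NR_MEM_SECTIONS];
static struct page map_for_section2[PAGES_PER_SECTION];

int main(void)
{
	/* As sparse_init() does: bias the stored pointer down by the
	 * section's first pfn, so that (base + pfn) indexes correctly. */
	unsigned long section2_start_pfn = 2UL << PFN_SECTION_SHIFT; /* 16 */
	mem_section[2].section_mem_map = map_for_section2 - section2_start_pfn;

	/* pfn_to_page(): one array lookup plus one add. */
	unsigned long pfn = 19;	/* 4th page of section 2 */
	struct page *page =
		mem_section[pfn >> PFN_SECTION_SHIFT].section_mem_map + pfn;
	assert(page == &map_for_section2[3]);

	/* page_to_pfn(): the inverse is a single subtraction (the kernel
	 * finds the section via page_to_section(); hardcoded here). */
	assert((unsigned long)(page - mem_section[2].section_mem_map) == 19);
	return 0;
}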