aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
author: Andy Whitcroft <apw@shadowen.org> 2005-06-23 03:07:54 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-06-23 12:45:04 -0400
commit: d41dee369bff3b9dcb6328d4d822926c28cc2594 (patch)
tree: a0405f3b7af3ebca21838a7d427bd75a067bf850 /include/linux
parent: af705362ab6018071310c5fcd436a6b457517d5f (diff)
[PATCH] sparsemem memory model
Sparsemem abstracts the use of discontiguous mem_maps[]. This kind of mem_map[] is needed by discontiguous memory machines (like in the old CONFIG_DISCONTIGMEM case) as well as memory hotplug systems. Sparsemem replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually become a complete replacement.

A significant advantage over DISCONTIGMEM is that it's completely separated from CONFIG_NUMA. When producing this patch, it became apparent that NUMA and DISCONTIG are often confused.

Another advantage is that sparse doesn't require each NUMA node's ranges to be contiguous. It can handle overlapping ranges between nodes with no problems, where DISCONTIGMEM currently throws away that memory.

Sparsemem uses an array to provide different pfn_to_page() translations for each SECTION_SIZE area of physical memory. This is what allows the mem_map[] to be chopped up.

In order to do quick pfn_to_page() operations, the section number of the page is encoded in page->flags. Part of the sparsemem infrastructure enables sharing of these bits more dynamically (at compile-time) between the page_zone() and sparsemem operations. However, on 32-bit architectures, the number of bits is quite limited, and may require growing the size of the page->flags type in certain conditions. Several things might force this to occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of memory), an increase in the physical address space, or an increase in the number of used page->flags.

One thing to note is that, once sparsemem is present, the NUMA node information no longer needs to be stored in the page->flags. It might provide speed increases on certain platforms and will be stored there if there is room. But, if out of room, an alternate (theoretically slower) mechanism is used.

This patch introduces CONFIG_FLATMEM. It is used in almost all cases where there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM often have to compile out the same areas of code.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include/linux')
-rw-r--r-- include/linux/mm.h 92
-rw-r--r-- include/linux/mmzone.h 96
-rw-r--r-- include/linux/numa.h 2
3 files changed, 172 insertions, 18 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57b2ead51dba..6eb7f48317f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -397,40 +397,80 @@ static inline void put_page(struct page *page)
397 * sets it, so none of the operations on it need to be atomic. 397 * sets it, so none of the operations on it need to be atomic.
398 */ 398 */
399 399
400/* Page flags: | NODE | ZONE | ... | FLAGS | */ 400
401#define NODES_PGOFF ((sizeof(page_flags_t)*8) - NODES_SHIFT) 401/*
402#define ZONES_PGOFF (NODES_PGOFF - ZONES_SHIFT) 402 * page->flags layout:
403 *
404 * There are three possibilities for how page->flags get
405 * laid out. The first is for the normal case, without
406 * sparsemem. The second is for sparsemem when there is
407 * plenty of space for node and section. The last is when
408 * we have run out of space and have to fall back to an
409 * alternate (slower) way of determining the node.
410 *
411 * No sparsemem: | NODE | ZONE | ... | FLAGS |
412 * with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
413 * no space for node: | SECTION | ZONE | ... | FLAGS |
414 */
415#ifdef CONFIG_SPARSEMEM
416#define SECTIONS_WIDTH SECTIONS_SHIFT
417#else
418#define SECTIONS_WIDTH 0
419#endif
420
421#define ZONES_WIDTH ZONES_SHIFT
422
423#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
424#define NODES_WIDTH NODES_SHIFT
425#else
426#define NODES_WIDTH 0
427#endif
428
429/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
430#define SECTIONS_PGOFF ((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
431#define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH)
432#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
433
434/*
435 * We are going to use the flags for the page to node mapping if its in
436 * there. This includes the case where there is no node, so it is implicit.
437 */
438#define FLAGS_HAS_NODE (NODES_WIDTH > 0 || NODES_SHIFT == 0)
439
440#ifndef PFN_SECTION_SHIFT
441#define PFN_SECTION_SHIFT 0
442#endif
403 443
404/* 444/*
405 * Define the bit shifts to access each section. For non-existant 445 * Define the bit shifts to access each section. For non-existant
406 * sections we define the shift as 0; that plus a 0 mask ensures 446 * sections we define the shift as 0; that plus a 0 mask ensures
407 * the compiler will optimise away reference to them. 447 * the compiler will optimise away reference to them.
408 */ 448 */
409#define NODES_PGSHIFT (NODES_PGOFF * (NODES_SHIFT != 0)) 449#define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
410#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_SHIFT != 0)) 450#define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0))
451#define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0))
411 452
412/* NODE:ZONE is used to lookup the zone from a page. */ 453/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
454#if FLAGS_HAS_NODE
413#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT) 455#define ZONETABLE_SHIFT (NODES_SHIFT + ZONES_SHIFT)
456#else
457#define ZONETABLE_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT)
458#endif
414#define ZONETABLE_PGSHIFT ZONES_PGSHIFT 459#define ZONETABLE_PGSHIFT ZONES_PGSHIFT
415 460
416#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED 461#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
417#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED 462#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
418#endif 463#endif
419 464
420#define NODEZONE(node, zone) ((node << ZONES_SHIFT) | zone) 465#define ZONES_MASK ((1UL << ZONES_WIDTH) - 1)
421 466#define NODES_MASK ((1UL << NODES_WIDTH) - 1)
422#define ZONES_MASK ((1UL << ZONES_SHIFT) - 1) 467#define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1)
423#define NODES_MASK ((1UL << NODES_SHIFT) - 1)
424#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1) 468#define ZONETABLE_MASK ((1UL << ZONETABLE_SHIFT) - 1)
425 469
426static inline unsigned long page_zonenum(struct page *page) 470static inline unsigned long page_zonenum(struct page *page)
427{ 471{
428 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; 472 return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
429} 473}
430static inline unsigned long page_to_nid(struct page *page)
431{
432 return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
433}
434 474
435struct zone; 475struct zone;
436extern struct zone *zone_table[]; 476extern struct zone *zone_table[];
@@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page)
441 ZONETABLE_MASK]; 481 ZONETABLE_MASK];
442} 482}
443 483
484static inline unsigned long page_to_nid(struct page *page)
485{
486 if (FLAGS_HAS_NODE)
487 return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
488 else
489 return page_zone(page)->zone_pgdat->node_id;
490}
491static inline unsigned long page_to_section(struct page *page)
492{
493 return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
494}
495
444static inline void set_page_zone(struct page *page, unsigned long zone) 496static inline void set_page_zone(struct page *page, unsigned long zone)
445{ 497{
446 page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); 498 page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
@@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node)
451 page->flags &= ~(NODES_MASK << NODES_PGSHIFT); 503 page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
452 page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; 504 page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
453} 505}
506static inline void set_page_section(struct page *page, unsigned long section)
507{
508 page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
509 page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
510}
454 511
455static inline void set_page_links(struct page *page, unsigned long zone, 512static inline void set_page_links(struct page *page, unsigned long zone,
456 unsigned long node) 513 unsigned long node, unsigned long pfn)
457{ 514{
458 set_page_zone(page, zone); 515 set_page_zone(page, zone);
459 set_page_node(page, node); 516 set_page_node(page, node);
517 set_page_section(page, pfn_to_section_nr(pfn));
460} 518}
461 519
462#ifndef CONFIG_DISCONTIGMEM 520#ifndef CONFIG_DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6ef07de98d69..19860d317ec2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -269,7 +269,9 @@ typedef struct pglist_data {
269 struct zone node_zones[MAX_NR_ZONES]; 269 struct zone node_zones[MAX_NR_ZONES];
270 struct zonelist node_zonelists[GFP_ZONETYPES]; 270 struct zonelist node_zonelists[GFP_ZONETYPES];
271 int nr_zones; 271 int nr_zones;
272#ifdef CONFIG_FLAT_NODE_MEM_MAP
272 struct page *node_mem_map; 273 struct page *node_mem_map;
274#endif
273 struct bootmem_data *bdata; 275 struct bootmem_data *bdata;
274 unsigned long node_start_pfn; 276 unsigned long node_start_pfn;
275 unsigned long node_present_pages; /* total number of physical pages */ 277 unsigned long node_present_pages; /* total number of physical pages */
@@ -284,7 +286,11 @@ typedef struct pglist_data {
284 286
285#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) 287#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
286#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) 288#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
289#ifdef CONFIG_FLAT_NODE_MEM_MAP
287#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) 290#define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr))
291#else
292#define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr))
293#endif
288#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) 294#define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr))
289 295
290extern struct pglist_data *pgdat_list; 296extern struct pglist_data *pgdat_list;
@@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data;
416 422
417#endif /* !CONFIG_NEED_MULTIPLE_NODES */ 423#endif /* !CONFIG_NEED_MULTIPLE_NODES */
418 424
425#ifdef CONFIG_SPARSEMEM
426#include <asm/sparsemem.h>
427#endif
428
419#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED) 429#if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
420/* 430/*
421 * with 32 bit page->flags field, we reserve 8 bits for node/zone info. 431 * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
@@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data;
439#define early_pfn_to_nid(nid) (0UL) 449#define early_pfn_to_nid(nid) (0UL)
440#endif 450#endif
441 451
452#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
453#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
454
455#ifdef CONFIG_SPARSEMEM
456
457/*
458 * SECTION_SHIFT #bits space required to store a section #
459 *
460 * PA_SECTION_SHIFT physical address to/from section number
461 * PFN_SECTION_SHIFT pfn to/from section number
462 */
463#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
464
465#define PA_SECTION_SHIFT (SECTION_SIZE_BITS)
466#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)
467
468#define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT)
469
470#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)
471#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1))
472
473#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
474#error Allocator MAX_ORDER exceeds SECTION_SIZE
475#endif
476
477struct page;
478struct mem_section {
479 struct page *section_mem_map;
480};
481
482extern struct mem_section mem_section[NR_MEM_SECTIONS];
483
484/*
485 * Given a kernel address, find the home node of the underlying memory.
486 */
487#define kvaddr_to_nid(kaddr) pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
488
489static inline struct mem_section *__pfn_to_section(unsigned long pfn)
490{
491 return &mem_section[pfn_to_section_nr(pfn)];
492}
493
494#define pfn_to_page(pfn) \
495({ \
496 unsigned long __pfn = (pfn); \
497 __pfn_to_section(__pfn)->section_mem_map + __pfn; \
498})
499#define page_to_pfn(page) \
500({ \
501 page - mem_section[page_to_section(page)].section_mem_map; \
502})
503
504static inline int pfn_valid(unsigned long pfn)
505{
506 if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
507 return 0;
508 return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0;
509}
510
511/*
512 * These are _only_ used during initialisation, therefore they
513 * can use __initdata ... They could have names to indicate
514 * this restriction.
515 */
516#ifdef CONFIG_NUMA
517#define pfn_to_nid early_pfn_to_nid
518#endif
519
520#define pfn_to_pgdat(pfn) \
521({ \
522 NODE_DATA(pfn_to_nid(pfn)); \
523})
524
525#define early_pfn_valid(pfn) pfn_valid(pfn)
526void sparse_init(void);
527#else
528#define sparse_init() do {} while (0)
529#endif /* CONFIG_SPARSEMEM */
530
531#ifndef early_pfn_valid
532#define early_pfn_valid(pfn) (1)
533#endif
534
535void memory_present(int nid, unsigned long start, unsigned long end);
536unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
537
442#endif /* !__ASSEMBLY__ */ 538#endif /* !__ASSEMBLY__ */
443#endif /* __KERNEL__ */ 539#endif /* __KERNEL__ */
444#endif /* _LINUX_MMZONE_H */ 540#endif /* _LINUX_MMZONE_H */
diff --git a/include/linux/numa.h b/include/linux/numa.h
index bd0c8c4e9a95..f0c539bd3cfc 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -3,7 +3,7 @@
3 3
4#include <linux/config.h> 4#include <linux/config.h>
5 5
6#ifdef CONFIG_DISCONTIGMEM 6#ifndef CONFIG_FLATMEM
7#include <asm/numnodes.h> 7#include <asm/numnodes.h>
8#endif 8#endif
9 9