author     Mel Gorman <mel@csn.ul.ie>                                2007-07-17 07:03:12 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>     2007-07-17 13:22:59 -0400
commit     2a1e274acf0b1c192face19a4be7c12d4503eaaf
tree       f7e98e1fe19d38bb10bf178fb8f8ed1789b659b2 /mm
parent     769848c03895b63e5662eb7e4ec8c4866f7d0183
Create the ZONE_MOVABLE zone
The following 8 patches against 2.6.20-mm2 create a zone called ZONE_MOVABLE that is only usable by allocations that specify both __GFP_HIGHMEM and __GFP_MOVABLE.  This has the effect of keeping all non-movable pages within a single memory partition while allowing movable allocations to be satisfied from either partition.  The patches may be applied with the list-based anti-fragmentation patches that group pages together based on mobility.

The size of the zone is determined by a kernelcore= parameter specified at boot-time.  This specifies how much memory is usable by non-movable allocations and the remainder is used for ZONE_MOVABLE.  Any range of pages within ZONE_MOVABLE can be released by migrating the pages or by reclaiming.

When selecting a zone to take pages from for ZONE_MOVABLE, there are two things to consider.  First, only memory from the highest populated zone is used for ZONE_MOVABLE.  On the x86, this is probably going to be ZONE_HIGHMEM but it would be ZONE_DMA on ppc64 or possibly ZONE_DMA32 on x86_64.  Second, the amount of memory usable by the kernel will be spread evenly throughout NUMA nodes where possible.  If the nodes are not of equal size, the amount of memory usable by the kernel on some nodes may be greater than on others.

By default, the zone is not as useful for hugetlb allocations because they are pinned and non-migratable (currently at least).  A sysctl is provided that allows huge pages to be allocated from that zone.  This means that the huge page pool can be resized to the size of ZONE_MOVABLE during the lifetime of the system assuming that pages are not mlocked.  Despite huge pages being non-movable, we do not introduce additional external fragmentation of note as huge pages are always the largest contiguous block we care about.

Credit goes to Andy Whitcroft for catching a large variety of problems during review of the patches.

This patch creates an additional zone, ZONE_MOVABLE.  This zone is only usable by allocations which specify both __GFP_HIGHMEM and __GFP_MOVABLE.  Hot-added memory continues to be placed in its existing destination as there is no mechanism to redirect it to a specific zone.

[y-goto@jp.fujitsu.com: Fix section mismatch of memory hotplug related code]
[akpm@linux-foundation.org: various fixes]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
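For context, a minimal sketch (not part of this patch) of which allocations can land in ZONE_MOVABLE: only requests carrying both __GFP_HIGHMEM and __GFP_MOVABLE qualify, and GFP_HIGHUSER_MOVABLE (visible in the mm/page_alloc.c hunk below) is the combined flag normally used for such allocations.  The helper names here are hypothetical, and the zone only exists at all when the kernel is booted with something like kernelcore=512M.

#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Hypothetical helpers, for illustration only: they show which gfp
 * flags make an allocation eligible for ZONE_MOVABLE.
 */
static struct page *movable_eligible_alloc(void)
{
	/*
	 * Carries __GFP_HIGHMEM and __GFP_MOVABLE: may be satisfied
	 * from ZONE_MOVABLE (or from ZONE_HIGHMEM and lower zones).
	 */
	return alloc_page(GFP_HIGHUSER_MOVABLE);
}

static struct page *kernel_partition_alloc(void)
{
	/* No __GFP_MOVABLE: must come from the kernel partition. */
	return alloc_page(GFP_KERNEL);
}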
Diffstat (limited to 'mm')
-rw-r--r--   mm/highmem.c       7
-rw-r--r--   mm/page_alloc.c  232
-rw-r--r--   mm/vmstat.c        2
3 files changed, 235 insertions, 6 deletions
diff --git a/mm/highmem.c b/mm/highmem.c
index be8f8d36a8b9..7a967bc35152 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -46,9 +46,14 @@ unsigned int nr_free_highpages (void)
 	pg_data_t *pgdat;
 	unsigned int pages = 0;
 
-	for_each_online_pgdat(pgdat)
+	for_each_online_pgdat(pgdat) {
 		pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
 			NR_FREE_PAGES);
+		if (zone_movable_is_highmem())
+			pages += zone_page_state(
+					&pgdat->node_zones[ZONE_MOVABLE],
+					NR_FREE_PAGES);
+	}
 
 	return pages;
 }
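The hunk above counts ZONE_MOVABLE's free pages as highmem only when zone_movable_is_highmem() is true, i.e. when the movable zone was carved out of the highmem zone; otherwise those pages belong to a lower zone and would be double-counted.  That helper is added outside mm/ (this diffstat is limited to 'mm'), so it is not visible here; a sketch of what it amounts to, in terms of the movable_zone variable introduced in mm/page_alloc.c below:

/*
 * Sketch only: the real definition is added elsewhere in this patch
 * series (include/linux/mmzone.h), outside the mm/ diff shown here.
 */
static inline int zone_movable_is_highmem(void)
{
#if defined(CONFIG_HIGHMEM) && defined(CONFIG_ARCH_POPULATES_NODE_MAP)
	/* True when ZONE_MOVABLE was taken from the highmem zone */
	return movable_zone == ZONE_HIGHMEM;
#else
	return 0;
#endif
}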
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f9e4e647d7e8..c3f6f851f76e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -80,8 +80,9 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 	 256,
 #endif
 #ifdef CONFIG_HIGHMEM
-	 32
+	 32,
 #endif
+	 32,
 };
 
 EXPORT_SYMBOL(totalram_pages);
@@ -95,8 +96,9 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 	 "Normal",
 #ifdef CONFIG_HIGHMEM
-	 "HighMem"
+	 "HighMem",
 #endif
+	 "Movable",
 };
 
 int min_free_kbytes = 1024;
@@ -134,6 +136,12 @@ static unsigned long __meminitdata dma_reserve;
 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
+unsigned long __initdata required_kernelcore;
+unsigned long __initdata zone_movable_pfn[MAX_NUMNODES];
+
+/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
+int movable_zone;
+EXPORT_SYMBOL(movable_zone);
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 #if MAX_NUMNODES > 1
@@ -1480,7 +1488,7 @@ unsigned int nr_free_buffer_pages(void)
  */
 unsigned int nr_free_pagecache_pages(void)
 {
-	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
+	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
 }
 
 static inline void show_node(struct zone *zone)
@@ -2667,6 +2675,63 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
 }
 
 /*
+ * This finds a zone that can be used for ZONE_MOVABLE pages. The
+ * assumption is made that zones within a node are ordered in monotonically
+ * increasing memory addresses so that the "highest" populated zone is used
+ */
+void __init find_usable_zone_for_movable(void)
+{
+	int zone_index;
+	for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
+		if (zone_index == ZONE_MOVABLE)
+			continue;
+
+		if (arch_zone_highest_possible_pfn[zone_index] >
+				arch_zone_lowest_possible_pfn[zone_index])
+			break;
+	}
+
+	VM_BUG_ON(zone_index == -1);
+	movable_zone = zone_index;
+}
+
+/*
+ * The zone ranges provided by the architecture do not include ZONE_MOVABLE
+ * because it is sized independent of architecture. Unlike the other zones,
+ * the starting point for ZONE_MOVABLE is not fixed. It may be different
+ * in each node depending on the size of each node and how evenly kernelcore
+ * is distributed. This helper function adjusts the zone ranges
+ * provided by the architecture for a given node by using the end of the
+ * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
+ * zones within a node are in order of monotonically increasing memory addresses
+ */
+void __meminit adjust_zone_range_for_zone_movable(int nid,
+					unsigned long zone_type,
+					unsigned long node_start_pfn,
+					unsigned long node_end_pfn,
+					unsigned long *zone_start_pfn,
+					unsigned long *zone_end_pfn)
+{
+	/* Only adjust if ZONE_MOVABLE is on this node */
+	if (zone_movable_pfn[nid]) {
+		/* Size ZONE_MOVABLE */
+		if (zone_type == ZONE_MOVABLE) {
+			*zone_start_pfn = zone_movable_pfn[nid];
+			*zone_end_pfn = min(node_end_pfn,
+				arch_zone_highest_possible_pfn[movable_zone]);
+
+		/* Adjust for ZONE_MOVABLE starting within this range */
+		} else if (*zone_start_pfn < zone_movable_pfn[nid] &&
+				*zone_end_pfn > zone_movable_pfn[nid]) {
+			*zone_end_pfn = zone_movable_pfn[nid];
+
+		/* Check if this whole range is within ZONE_MOVABLE */
+		} else if (*zone_start_pfn >= zone_movable_pfn[nid])
+			*zone_start_pfn = *zone_end_pfn;
+	}
+}
+
+/*
  * Return the number of pages a zone spans in a node, including holes
  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
  */
@@ -2681,6 +2746,9 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 	get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
 	zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
 	zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+	adjust_zone_range_for_zone_movable(nid, zone_type,
+				node_start_pfn, node_end_pfn,
+				&zone_start_pfn, &zone_end_pfn);
 
 	/* Check that this node has pages within the zone's required range */
 	if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
@@ -2771,6 +2839,9 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
 	zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
 			node_end_pfn);
 
+	adjust_zone_range_for_zone_movable(nid, zone_type,
+			node_start_pfn, node_end_pfn,
+			&zone_start_pfn, &zone_end_pfn);
 	return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
 }
 
@@ -3148,6 +3219,122 @@ unsigned long __init find_max_pfn_with_active_regions(void)
 	return max_pfn;
 }
 
+/*
+ * Find the PFN the Movable zone begins in each node. Kernel memory
+ * is spread evenly between nodes as long as the nodes have enough
+ * memory. When they don't, some nodes will have more kernelcore than
+ * others
+ */
+void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+{
+	int i, nid;
+	unsigned long usable_startpfn;
+	unsigned long kernelcore_node, kernelcore_remaining;
+	int usable_nodes = num_online_nodes();
+
+	/* If kernelcore was not specified, there is no ZONE_MOVABLE */
+	if (!required_kernelcore)
+		return;
+
+	/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
+	find_usable_zone_for_movable();
+	usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
+
+restart:
+	/* Spread kernelcore memory as evenly as possible throughout nodes */
+	kernelcore_node = required_kernelcore / usable_nodes;
+	for_each_online_node(nid) {
+		/*
+		 * Recalculate kernelcore_node if the division per node
+		 * now exceeds what is necessary to satisfy the requested
+		 * amount of memory for the kernel
+		 */
+		if (required_kernelcore < kernelcore_node)
+			kernelcore_node = required_kernelcore / usable_nodes;
+
+		/*
+		 * As the map is walked, we track how much memory is usable
+		 * by the kernel using kernelcore_remaining. When it is
+		 * 0, the rest of the node is usable by ZONE_MOVABLE
+		 */
+		kernelcore_remaining = kernelcore_node;
+
+		/* Go through each range of PFNs within this node */
+		for_each_active_range_index_in_nid(i, nid) {
+			unsigned long start_pfn, end_pfn;
+			unsigned long size_pages;
+
+			start_pfn = max(early_node_map[i].start_pfn,
+						zone_movable_pfn[nid]);
+			end_pfn = early_node_map[i].end_pfn;
+			if (start_pfn >= end_pfn)
+				continue;
+
+			/* Account for what is only usable for kernelcore */
+			if (start_pfn < usable_startpfn) {
+				unsigned long kernel_pages;
+				kernel_pages = min(end_pfn, usable_startpfn)
+								- start_pfn;
+
+				kernelcore_remaining -= min(kernel_pages,
+							kernelcore_remaining);
+				required_kernelcore -= min(kernel_pages,
+							required_kernelcore);
+
+				/* Continue if range is now fully accounted */
+				if (end_pfn <= usable_startpfn) {
+
+					/*
+					 * Push zone_movable_pfn to the end so
+					 * that if we have to rebalance
+					 * kernelcore across nodes, we will
+					 * not double account here
+					 */
+					zone_movable_pfn[nid] = end_pfn;
+					continue;
+				}
+				start_pfn = usable_startpfn;
+			}
+
+			/*
+			 * The usable PFN range for ZONE_MOVABLE is from
+			 * start_pfn->end_pfn. Calculate size_pages as the
+			 * number of pages used as kernelcore
+			 */
+			size_pages = end_pfn - start_pfn;
+			if (size_pages > kernelcore_remaining)
+				size_pages = kernelcore_remaining;
+			zone_movable_pfn[nid] = start_pfn + size_pages;
+
+			/*
+			 * Some kernelcore has been met, update counts and
+			 * break if the kernelcore for this node has been
+			 * satisfied
+			 */
+			required_kernelcore -= min(required_kernelcore,
+								size_pages);
+			kernelcore_remaining -= size_pages;
+			if (!kernelcore_remaining)
+				break;
+		}
+	}
+
+	/*
+	 * If there is still required_kernelcore, we do another pass with one
+	 * less node in the count. This will push zone_movable_pfn[nid] further
+	 * along on the nodes that still have memory until kernelcore is
+	 * satisfied
+	 */
+	usable_nodes--;
+	if (usable_nodes && required_kernelcore > usable_nodes)
+		goto restart;
+
+	/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
+	for (nid = 0; nid < MAX_NUMNODES; nid++)
+		zone_movable_pfn[nid] =
+			roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+}
+
 /**
  * free_area_init_nodes - Initialise all pg_data_t and zone data
  * @max_zone_pfn: an array of max PFNs for each zone
@@ -3177,19 +3364,37 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
 	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
 	for (i = 1; i < MAX_NR_ZONES; i++) {
+		if (i == ZONE_MOVABLE)
+			continue;
 		arch_zone_lowest_possible_pfn[i] =
 			arch_zone_highest_possible_pfn[i-1];
 		arch_zone_highest_possible_pfn[i] =
 			max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
 	}
+	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
+	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
+
+	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
+	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
+	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
 
 	/* Print out the zone ranges */
 	printk("Zone PFN ranges:\n");
-	for (i = 0; i < MAX_NR_ZONES; i++)
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (i == ZONE_MOVABLE)
+			continue;
 		printk(" %-8s %8lu -> %8lu\n",
 			zone_names[i],
 			arch_zone_lowest_possible_pfn[i],
 			arch_zone_highest_possible_pfn[i]);
+	}
+
+	/* Print out the PFNs ZONE_MOVABLE begins at in each node */
+	printk("Movable zone start PFN for each node\n");
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		if (zone_movable_pfn[i])
+			printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
+	}
 
 	/* Print out the early_node_map[] */
 	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
@@ -3206,6 +3411,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 				find_min_pfn_for_node(nid), NULL);
 	}
 }
+
+/*
+ * kernelcore=size sets the amount of memory for use for allocations that
+ * cannot be reclaimed or migrated.
+ */
+int __init cmdline_parse_kernelcore(char *p)
+{
+	unsigned long long coremem;
+	if (!p)
+		return -EINVAL;
+
+	coremem = memparse(p, &p);
+	required_kernelcore = coremem >> PAGE_SHIFT;
+
+	/* Paranoid check that UL is enough for required_kernelcore */
+	WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
+
+	return 0;
+}
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
 /**
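A rough worked example of the kernelcore spreading done by find_zone_movable_pfns_for_nodes() above, with invented numbers and the per-node PFN-range walk collapsed to a single capacity figure: booting with kernelcore= (e.g. kernelcore=512M, parsed by cmdline_parse_kernelcore() via memparse() and presumably wired up as a boot parameter elsewhere in the series) requests a fixed number of non-movable pages.  With nodes of 1024, 1024 and 256 pages and a request for 1800 kernelcore pages, the first pass hands out 600 pages per node, the small node can only absorb 256, and the missing 344 pages are rebalanced over the two larger nodes on a second pass.  The standalone userspace sketch below reproduces that arithmetic; it is not kernel code.

/*
 * Userspace sketch of the kernelcore spreading arithmetic.
 * Node sizes and required_kernelcore are invented numbers.
 */
#include <stdio.h>

#define NR_NODES 3

int main(void)
{
	unsigned long node_pages[NR_NODES] = { 1024, 1024, 256 };
	unsigned long assigned[NR_NODES] = { 0, 0, 0 };
	unsigned long required_kernelcore = 1800;	/* pages */
	unsigned long kernelcore_node, take;
	int usable_nodes = NR_NODES;
	int nid;

restart:
	/* Spread what is still required as evenly as possible */
	kernelcore_node = required_kernelcore / usable_nodes;
	for (nid = 0; nid < NR_NODES; nid++) {
		take = node_pages[nid] - assigned[nid];	/* capacity left */
		if (take > kernelcore_node)
			take = kernelcore_node;
		if (take > required_kernelcore)
			take = required_kernelcore;
		assigned[nid] += take;
		required_kernelcore -= take;
	}
	/*
	 * Small nodes could not take a full share: retry with one node
	 * less, mirroring the restart loop in the kernel function.
	 */
	usable_nodes--;
	if (usable_nodes && required_kernelcore > (unsigned long)usable_nodes)
		goto restart;

	for (nid = 0; nid < NR_NODES; nid++)
		printf("node %d: kernelcore %lu, ZONE_MOVABLE %lu pages\n",
		       nid, assigned[nid], node_pages[nid] - assigned[nid]);
	return 0;
}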
diff --git a/mm/vmstat.c b/mm/vmstat.c
index eceaf496210f..fadf791cd7e6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -472,7 +472,7 @@ const struct seq_operations fragmentation_op = {
 #endif
 
 #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
-					TEXT_FOR_HIGHMEM(xx)
+					TEXT_FOR_HIGHMEM(xx) xx "_movable",
 
 static const char * const vmstat_text[] = {
 	/* Zoned VM counters */