From be49a6e13537f32f85bd8c4ba04317ca36ca60ff Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 12 Dec 2012 13:51:19 -0800 Subject: mm: use migrate_prep() instead of migrate_prep_local() __alloc_contig_migrate_range() should use all possible ways to get all the pages migrated from the given memory range, so pruning per-cpu lru lists for all CPUs is required, regadless the cost of such operation. Otherwise some pages which got stuck at per-cpu lru list might get missed by migration procedure causing the contiguous allocation to fail. Reported-by: SeongHwan Yoon Signed-off-by: Marek Szyprowski Signed-off-by: Kyungmin Park Acked-by: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a8d339d282a..4171cd4f8257 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5727,7 +5727,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, unsigned int tries = 0; int ret = 0; - migrate_prep_local(); + migrate_prep(); while (pfn < end || !list_empty(&cc->migratepages)) { if (fatal_signal_pending(current)) { -- cgit v1.2.2 From 4b0ef1fe8a626f0ba7f649764f979d0dc9eab86b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:51:46 -0800 Subject: page_alloc: use N_MEMORY instead N_HIGH_MEMORY change the node_states initialization N_HIGH_MEMORY stands for the nodes that has normal or high memory. N_MEMORY stands for the nodes that has any memory. The code here need to handle with the nodes which have memory, we should use N_MEMORY instead. Since we introduced N_MEMORY, we update the initialization of node_states. Signed-off-by: Lai Jiangshan Signed-off-by: Lin Feng Signed-off-by: Wen Congyang Cc: Christoph Lameter Cc: Hillf Danton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4171cd4f8257..35727168896b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1695,7 +1695,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, * * If the zonelist cache is present in the passed in zonelist, then * returns a pointer to the allowed node mask (either the current - * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) + * tasks mems_allowed, or node_states[N_MEMORY].) * * If the zonelist cache is not available for this zonelist, does * nothing and returns NULL. @@ -1724,7 +1724,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? &cpuset_current_mems_allowed : - &node_states[N_HIGH_MEMORY]; + &node_states[N_MEMORY]; return allowednodes; } @@ -3238,7 +3238,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) return node; } - for_each_node_state(n, N_HIGH_MEMORY) { + for_each_node_state(n, N_MEMORY) { /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) @@ -3380,7 +3380,7 @@ static int default_zonelist_order(void) * local memory, NODE_ORDER may be suitable. */ average_size = total_size / - (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); + (nodes_weight(node_states[N_MEMORY]) + 1); for_each_online_node(nid) { low_kmem_size = 0; total_size = 0; @@ -4731,7 +4731,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. - * Populate N_HIGH_MEMORY for calculating usable_nodes. + * Populate N_MEMORY for calculating usable_nodes. */ static unsigned long __init early_calculate_totalpages(void) { @@ -4744,7 +4744,7 @@ static unsigned long __init early_calculate_totalpages(void) totalpages += pages; if (pages) - node_set_state(nid, N_HIGH_MEMORY); + node_set_state(nid, N_MEMORY); } return totalpages; } @@ -4761,9 +4761,9 @@ static void __init find_zone_movable_pfns_for_nodes(void) unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; /* save the state before borrow the nodemask */ - nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; + nodemask_t saved_node_state = node_states[N_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); - int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); + int usable_nodes = nodes_weight(node_states[N_MEMORY]); /* * If movablecore was specified, calculate what size of @@ -4798,7 +4798,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; - for_each_node_state(nid, N_HIGH_MEMORY) { + for_each_node_state(nid, N_MEMORY) { unsigned long start_pfn, end_pfn; /* @@ -4890,23 +4890,27 @@ restart: out: /* restore the node_state */ - node_states[N_HIGH_MEMORY] = saved_node_state; + node_states[N_MEMORY] = saved_node_state; } -/* Any regular memory on that node ? */ -static void __init check_for_regular_memory(pg_data_t *pgdat) +/* Any regular or high memory on that node ? */ +static void check_for_memory(pg_data_t *pgdat, int nid) { -#ifdef CONFIG_HIGHMEM enum zone_type zone_type; - for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { + if (N_MEMORY == N_NORMAL_MEMORY) + return; + + for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; if (zone->present_pages) { - node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); + node_set_state(nid, N_HIGH_MEMORY); + if (N_NORMAL_MEMORY != N_HIGH_MEMORY && + zone_type <= ZONE_NORMAL) + node_set_state(nid, N_NORMAL_MEMORY); break; } } -#endif } /** @@ -4989,8 +4993,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) /* Any memory on that node */ if (pgdat->node_present_pages) - node_set_state(nid, N_HIGH_MEMORY); - check_for_regular_memory(pgdat); + node_set_state(nid, N_MEMORY); + check_for_memory(pgdat, nid); } } -- cgit v1.2.2 From 20b2f52b73febce476fc9376f0296c1aa0e4f5a7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 12 Dec 2012 13:52:00 -0800 Subject: numa: add CONFIG_MOVABLE_NODE for movable-dedicated node We need a node which only contains movable memory. This feature is very important for node hotplug. If a node has normal/highmem, the memory may be used by the kernel and can't be offlined. If the node only contains movable memory, we can offline the memory and the node. All are prepared, we can actually introduce N_MEMORY. add CONFIG_MOVABLE_NODE make we can use it for movable-dedicated node [akpm@linux-foundation.org: fix Kconfig text] Signed-off-by: Lai Jiangshan Tested-by: Yasuaki Ishimatsu Signed-off-by: Wen Congyang Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 35727168896b..2bf0d43d646b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -89,6 +89,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_NORMAL_MEMORY] = { { [0] = 1UL } }, #ifdef CONFIG_HIGHMEM [N_HIGH_MEMORY] = { { [0] = 1UL } }, +#endif +#ifdef CONFIG_MOVABLE_NODE + [N_MEMORY] = { { [0] = 1UL } }, #endif [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ -- cgit v1.2.2 From 9feedc9d831e18ae6d0d15aa562e5e46ba53647b Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 12 Dec 2012 13:52:12 -0800 Subject: mm: introduce new field "managed_pages" to struct zone Currently a zone's present_pages is calcuated as below, which is inaccurate and may cause trouble to memory hotplug. spanned_pages - absent_pages - memmap_pages - dma_reserve. During fixing bugs caused by inaccurate zone->present_pages, we found zone->present_pages has been abused. The field zone->present_pages may have different meanings in different contexts: 1) pages existing in a zone. 2) pages managed by the buddy system. For more discussions about the issue, please refer to: http://lkml.org/lkml/2012/11/5/866 https://patchwork.kernel.org/patch/1346751/ This patchset tries to introduce a new field named "managed_pages" to struct zone, which counts "pages managed by the buddy system". And revert zone->present_pages to count "physical pages existing in a zone", which also keep in consistence with pgdat->node_present_pages. We will set an initial value for zone->managed_pages in function free_area_init_core() and will adjust it later if the initial value is inaccurate. For DMA/normal zones, the initial value is set to: (spanned_pages - absent_pages - memmap_pages - dma_reserve) Later zone->managed_pages will be adjusted to the accurate value when the bootmem allocator frees all free pages to the buddy system in function free_all_bootmem_node() and free_all_bootmem(). The bootmem allocator doesn't touch highmem pages, so highmem zones' managed_pages is set to the accurate value "spanned_pages - absent_pages" in function free_area_init_core() and won't be updated anymore. This patch also adds a new field "managed_pages" to /proc/zoneinfo and sysrq showmem. [akpm@linux-foundation.org: small comment tweaks] Signed-off-by: Jiang Liu Cc: Wen Congyang Cc: David Rientjes Cc: Maciej Rutecki Tested-by: Chris Clayton Cc: "Rafael J . Wysocki" Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Jianguo Wu Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bf0d43d646b..0b6a6d04300a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -735,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } +/* + * Read access to zone->managed_pages is safe because it's unsigned long, + * but we still need to serialize writers. Currently all callers of + * __free_pages_bootmem() except put_page_bootmem() should only be used + * at boot time. So for shorter boot time, we shift the burden to + * put_page_bootmem() to serialize writers. + */ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; @@ -750,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) set_page_count(p, 0); } + page_zone(page)->managed_pages += 1 << order; set_page_refcounted(page); __free_pages(page, order); } @@ -2984,6 +2992,7 @@ void show_free_areas(unsigned int filter) " isolated(anon):%lukB" " isolated(file):%lukB" " present:%lukB" + " managed:%lukB" " mlocked:%lukB" " dirty:%lukB" " writeback:%lukB" @@ -3013,6 +3022,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_ISOLATED_ANON)), K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone->managed_pages), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_FILE_DIRTY)), K(zone_page_state(zone, NR_WRITEBACK)), @@ -4502,48 +4512,54 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, memmap_pages; + unsigned long size, realsize, freesize, memmap_pages; size = zone_spanned_pages_in_node(nid, j, zones_size); - realsize = size - zone_absent_pages_in_node(nid, j, + realsize = freesize = size - zone_absent_pages_in_node(nid, j, zholes_size); /* - * Adjust realsize so that it accounts for how much memory + * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ memmap_pages = PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; - if (realsize >= memmap_pages) { - realsize -= memmap_pages; + if (freesize >= memmap_pages) { + freesize -= memmap_pages; if (memmap_pages) printk(KERN_DEBUG " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else printk(KERN_WARNING - " %s zone: %lu pages exceeds realsize %lu\n", - zone_names[j], memmap_pages, realsize); + " %s zone: %lu pages exceeds freesize %lu\n", + zone_names[j], memmap_pages, freesize); /* Account for reserved pages */ - if (j == 0 && realsize > dma_reserve) { - realsize -= dma_reserve; + if (j == 0 && freesize > dma_reserve) { + freesize -= dma_reserve; printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } if (!is_highmem_idx(j)) - nr_kernel_pages += realsize; - nr_all_pages += realsize; + nr_kernel_pages += freesize; + nr_all_pages += freesize; zone->spanned_pages = size; - zone->present_pages = realsize; + zone->present_pages = freesize; + /* + * Set an approximate value for lowmem here, it will be adjusted + * when the bootmem allocator frees pages into the buddy system. + * And all highmem pages will be managed by the buddy system. + */ + zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; #ifdef CONFIG_NUMA zone->node = nid; - zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) + zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) / 100; - zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; + zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); -- cgit v1.2.2 From 01cefaef40c48c714b7476e62781230993e10dbf Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 12 Dec 2012 13:52:19 -0800 Subject: mm: provide more accurate estimation of pages occupied by memmap If SPARSEMEM is enabled, it won't build page structures for non-existing pages (holes) within a zone, so provide a more accurate estimation of pages occupied by memmap if there are bigger holes within the zone. And pages for highmem zones' memmap will be allocated from lowmem, so charge nr_kernel_pages for that. [akpm@linux-foundation.org: mark calc_memmap_size __paging_init] Signed-off-by: Jiang Liu Cc: Wen Congyang Cc: David Rientjes Cc: Maciej Rutecki Cc: Chris Clayton Cc: "Rafael J . Wysocki" Cc: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Tested-by: Jianguo Wu Cc: Dave Hansen Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b6a6d04300a..d187988e2b56 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4489,6 +4489,26 @@ void __init set_pageblock_order(void) #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ +static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) +{ + unsigned long pages = spanned_pages; + + /* + * Provide a more accurate estimation if there are holes within + * the zone and SPARSEMEM is in use. If there are holes within the + * zone, each populated memory region may cost us one or two extra + * memmap pages due to alignment because memmap pages for each + * populated regions may not naturally algined on page boundary. + * So the (present_pages >> 4) heuristic is a tradeoff for that. + */ + if (spanned_pages > present_pages + (present_pages >> 4) && + IS_ENABLED(CONFIG_SPARSEMEM)) + pages = present_pages; + + return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; +} + /* * Set up the zone data structures: * - mark all pages reserved @@ -4523,8 +4543,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = - PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; + memmap_pages = calc_memmap_size(size, realsize); if (freesize >= memmap_pages) { freesize -= memmap_pages; if (memmap_pages) @@ -4545,6 +4564,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, if (!is_highmem_idx(j)) nr_kernel_pages += freesize; + /* Charge for highmem memmap if there are enough kernel pages */ + else if (nr_kernel_pages > memmap_pages * 2) + nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; zone->spanned_pages = size; -- cgit v1.2.2