author     Jiang Liu <liuj97@gmail.com>                    2012-12-12 16:52:12 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-12-12 20:38:34 -0500
commit     9feedc9d831e18ae6d0d15aa562e5e46ba53647b
tree       cb26ff54b0f02c4905772288b27f99b8b384ad6d  /mm/page_alloc.c
parent     c2d23f919bafcbc2259f5257d9a7d729802f0e3a
mm: introduce new field "managed_pages" to struct zone
Currently a zone's present_pages is calculated as below, which is inaccurate and may cause trouble for memory hotplug:

    spanned_pages - absent_pages - memmap_pages - dma_reserve

While fixing bugs caused by the inaccurate zone->present_pages, we found that the field has been abused: zone->present_pages may have different meanings in different contexts:
1) pages existing in a zone.
2) pages managed by the buddy system.

For more discussion of the issue, please refer to:
http://lkml.org/lkml/2012/11/5/866
https://patchwork.kernel.org/patch/1346751/

This patchset introduces a new field named "managed_pages" to struct zone, which counts "pages managed by the buddy system", and reverts zone->present_pages to counting "physical pages existing in a zone", which is also consistent with pgdat->node_present_pages.

We set an initial value for zone->managed_pages in free_area_init_core() and adjust it later if the initial value is inaccurate.

For DMA/normal zones, the initial value is set to:

    (spanned_pages - absent_pages - memmap_pages - dma_reserve)

Later zone->managed_pages is adjusted to the accurate value when the bootmem allocator frees all free pages to the buddy system in free_all_bootmem_node() and free_all_bootmem().

The bootmem allocator doesn't touch highmem pages, so highmem zones' managed_pages is set to the accurate value "spanned_pages - absent_pages" in free_area_init_core() and is not updated afterwards.

This patch also adds the new field "managed_pages" to /proc/zoneinfo and sysrq showmem.

[akpm@linux-foundation.org: small comment tweaks]
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Maciej Rutecki <maciej.rutecki@gmail.com>
Tested-by: Chris Clayton <chris2553@googlemail.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
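As a rough illustration of the accounting described above, the following stand-alone C sketch (illustrative only, not kernel code and not part of this patch; all page counts are made-up values) mirrors the initial lowmem estimate for zone->managed_pages alongside the reverted meaning of zone->present_pages:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical zone layout, chosen only for illustration. */
            unsigned long spanned_pages = 262144; /* pages spanned by the zone */
            unsigned long absent_pages  = 4096;   /* holes within the span */
            unsigned long memmap_pages  = 5120;   /* pages used by the struct page array */
            unsigned long dma_reserve   = 1024;   /* reserve, DMA zone only */

            /* present_pages: physical pages existing in the zone. */
            unsigned long present_pages = spanned_pages - absent_pages;

            /*
             * managed_pages: initial estimate of the pages handed to the
             * buddy allocator; the kernel later corrects this value when
             * the bootmem allocator releases its pages into the buddy
             * system.
             */
            unsigned long managed_pages = present_pages - memmap_pages - dma_reserve;

            printf("present: %lu pages, managed: %lu pages\n",
                   present_pages, managed_pages);
            return 0;
    }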
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  44
1 file changed, 30 insertions, 14 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bf0d43d646b..0b6a6d04300a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -735,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
+/*
+ * Read access to zone->managed_pages is safe because it's unsigned long,
+ * but we still need to serialize writers. Currently all callers of
+ * __free_pages_bootmem() except put_page_bootmem() should only be used
+ * at boot time. So for shorter boot time, we shift the burden to
+ * put_page_bootmem() to serialize writers.
+ */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
@@ -750,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 		set_page_count(p, 0);
 	}
 
+	page_zone(page)->managed_pages += 1 << order;
 	set_page_refcounted(page);
 	__free_pages(page, order);
 }
@@ -2984,6 +2992,7 @@ void show_free_areas(unsigned int filter)
2984 " isolated(anon):%lukB" 2992 " isolated(anon):%lukB"
2985 " isolated(file):%lukB" 2993 " isolated(file):%lukB"
2986 " present:%lukB" 2994 " present:%lukB"
2995 " managed:%lukB"
2987 " mlocked:%lukB" 2996 " mlocked:%lukB"
2988 " dirty:%lukB" 2997 " dirty:%lukB"
2989 " writeback:%lukB" 2998 " writeback:%lukB"
@@ -3013,6 +3022,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_ISOLATED_ANON)),
 			K(zone_page_state(zone, NR_ISOLATED_FILE)),
 			K(zone->present_pages),
+			K(zone->managed_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_FILE_DIRTY)),
 			K(zone_page_state(zone, NR_WRITEBACK)),
@@ -4502,48 +4512,54 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize, memmap_pages;
+		unsigned long size, realsize, freesize, memmap_pages;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
+		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
 								zholes_size);
 
 		/*
-		 * Adjust realsize so that it accounts for how much memory
+		 * Adjust freesize so that it accounts for how much memory
 		 * is used by this zone for memmap. This affects the watermark
 		 * and per-cpu initialisations
 		 */
 		memmap_pages =
 			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
-		if (realsize >= memmap_pages) {
-			realsize -= memmap_pages;
+		if (freesize >= memmap_pages) {
+			freesize -= memmap_pages;
 			if (memmap_pages)
 				printk(KERN_DEBUG
 					" %s zone: %lu pages used for memmap\n",
 					zone_names[j], memmap_pages);
 		} else
 			printk(KERN_WARNING
-				" %s zone: %lu pages exceeds realsize %lu\n",
-				zone_names[j], memmap_pages, realsize);
+				" %s zone: %lu pages exceeds freesize %lu\n",
+				zone_names[j], memmap_pages, freesize);
 
 		/* Account for reserved pages */
-		if (j == 0 && realsize > dma_reserve) {
-			realsize -= dma_reserve;
+		if (j == 0 && freesize > dma_reserve) {
+			freesize -= dma_reserve;
 			printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
 					zone_names[0], dma_reserve);
 		}
 
 		if (!is_highmem_idx(j))
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
+			nr_kernel_pages += freesize;
+		nr_all_pages += freesize;
 
 		zone->spanned_pages = size;
-		zone->present_pages = realsize;
+		zone->present_pages = freesize;
+		/*
+		 * Set an approximate value for lowmem here, it will be adjusted
+		 * when the bootmem allocator frees pages into the buddy system.
+		 * And all highmem pages will be managed by the buddy system.
+		 */
+		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
 		zone->node = nid;
-		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
 						/ 100;
-		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);