aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jiang Liu <liuj97@gmail.com> 2012-12-12 16:52:12 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-12-12 20:38:34 -0500
commit 9feedc9d831e18ae6d0d15aa562e5e46ba53647b (patch)
tree cb26ff54b0f02c4905772288b27f99b8b384ad6d
parent c2d23f919bafcbc2259f5257d9a7d729802f0e3a (diff)
mm: introduce new field "managed_pages" to struct zone
Currently a zone's present_pages is calculated as below, which is inaccurate and may cause trouble to memory hotplug:

    spanned_pages - absent_pages - memmap_pages - dma_reserve

While fixing bugs caused by inaccurate zone->present_pages, we found zone->present_pages has been abused. The field zone->present_pages may have different meanings in different contexts:
1) pages existing in a zone.
2) pages managed by the buddy system.

For more discussions about the issue, please refer to:
http://lkml.org/lkml/2012/11/5/866
https://patchwork.kernel.org/patch/1346751/

This patchset tries to introduce a new field named "managed_pages" to struct zone, which counts "pages managed by the buddy system". It also reverts zone->present_pages to count "physical pages existing in a zone", which keeps it consistent with pgdat->node_present_pages.

We will set an initial value for zone->managed_pages in function free_area_init_core() and will adjust it later if the initial value is inaccurate.

For DMA/normal zones, the initial value is set to:

    (spanned_pages - absent_pages - memmap_pages - dma_reserve)

Later zone->managed_pages will be adjusted to the accurate value when the bootmem allocator frees all free pages to the buddy system in function free_all_bootmem_node() and free_all_bootmem().

The bootmem allocator doesn't touch highmem pages, so highmem zones' managed_pages is set to the accurate value "spanned_pages - absent_pages" in function free_area_init_core() and won't be updated anymore.

This patch also adds a new field "managed_pages" to /proc/zoneinfo and sysrq showmem.

[akpm@linux-foundation.org: small comment tweaks]
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Maciej Rutecki <maciej.rutecki@gmail.com>
Tested-by: Chris Clayton <chris2553@googlemail.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Minchan Kim <minchan@kernel.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/mmzone.h 41
-rw-r--r-- mm/bootmem.c 21
-rw-r--r-- mm/memory_hotplug.c 10
-rw-r--r-- mm/nobootmem.c 22
-rw-r--r-- mm/page_alloc.c 44
-rw-r--r-- mm/vmstat.c 6
6 files changed, 121 insertions, 23 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c0b1d608a69..cd55dad56aac 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -460,17 +460,44 @@ struct zone {
460 unsigned long zone_start_pfn; 460 unsigned long zone_start_pfn;
461 461
462 /* 462 /*
463 * zone_start_pfn, spanned_pages and present_pages are all 463 * spanned_pages is the total pages spanned by the zone, including
464 * protected by span_seqlock. It is a seqlock because it has 464 * holes, which is calculated as:
465 * to be read outside of zone->lock, and it is done in the main 465 * spanned_pages = zone_end_pfn - zone_start_pfn;
466 * allocator path. But, it is written quite infrequently.
467 * 466 *
468 * The lock is declared along with zone->lock because it is 467 * present_pages is physical pages existing within the zone, which
468 * is calculated as:
469 * present_pages = spanned_pages - absent_pages(pages in holes);
470 *
471 * managed_pages is present pages managed by the buddy system, which
472 * is calculated as (reserved_pages includes pages allocated by the
473 * bootmem allocator):
474 * managed_pages = present_pages - reserved_pages;
475 *
476 * So present_pages may be used by memory hotplug or memory power
477 * management logic to figure out unmanaged pages by checking
478 * (present_pages - managed_pages). And managed_pages should be used
479 * by page allocator and vm scanner to calculate all kinds of watermarks
480 * and thresholds.
481 *
482 * Locking rules:
483 *
484 * zone_start_pfn and spanned_pages are protected by span_seqlock.
485 * It is a seqlock because it has to be read outside of zone->lock,
486 * and it is done in the main allocator path. But, it is written
487 * quite infrequently.
488 *
489 * The span_seq lock is declared along with zone->lock because it is
469 * frequently read in proximity to zone->lock. It's good to 490 * frequently read in proximity to zone->lock. It's good to
470 * give them a chance of being in the same cacheline. 491 * give them a chance of being in the same cacheline.
492 *
493 * Write access to present_pages and managed_pages at runtime should
494 * be protected by lock_memory_hotplug()/unlock_memory_hotplug().
495 * Any reader who can't tolerate drift of present_pages and
496 * managed_pages should hold memory hotplug lock to get a stable value.
471 */ 497 */
472 unsigned long spanned_pages; /* total size, including holes */ 498 unsigned long spanned_pages;
473 unsigned long present_pages; /* amount of memory (excluding holes) */ 499 unsigned long present_pages;
500 unsigned long managed_pages;
474 501
475 /* 502 /*
476 * rarely used fields: 503 * rarely used fields:
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 26d057a8b552..19262ac05dd2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
229 return count; 229 return count;
230} 230}
231 231
232static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
233{
234 struct zone *z;
235
236 /*
237 * In free_area_init_core(), highmem zone's managed_pages is set to
238 * present_pages, and bootmem allocator doesn't allocate from highmem
239 * zones. So there's no need to recalculate managed_pages because all
240 * highmem pages will be managed by the buddy system. Here highmem
241 * zone also includes highmem movable zone.
242 */
243 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
244 if (!is_highmem(z))
245 z->managed_pages = 0;
246}
247
232/** 248/**
233 * free_all_bootmem_node - release a node's free pages to the buddy allocator 249 * free_all_bootmem_node - release a node's free pages to the buddy allocator
234 * @pgdat: node to be released 250 * @pgdat: node to be released
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
238unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 254unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
239{ 255{
240 register_page_bootmem_info_node(pgdat); 256 register_page_bootmem_info_node(pgdat);
257 reset_node_lowmem_managed_pages(pgdat);
241 return free_all_bootmem_core(pgdat->bdata); 258 return free_all_bootmem_core(pgdat->bdata);
242} 259}
243 260
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void)
250{ 267{
251 unsigned long total_pages = 0; 268 unsigned long total_pages = 0;
252 bootmem_data_t *bdata; 269 bootmem_data_t *bdata;
270 struct pglist_data *pgdat;
271
272 for_each_online_pgdat(pgdat)
273 reset_node_lowmem_managed_pages(pgdat);
253 274
254 list_for_each_entry(bdata, &bdata_list, list) 275 list_for_each_entry(bdata, &bdata_list, list)
255 total_pages += free_all_bootmem_core(bdata); 276 total_pages += free_all_bootmem_core(bdata);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c6cd8b515424..b7c93ca896d6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page)
115 ClearPagePrivate(page); 116 ClearPagePrivate(page);
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
119
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
118 __free_pages_bootmem(page, 0); 125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
119 } 127 }
120 128
121} 129}
@@ -748,6 +756,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
748 return ret; 756 return ret;
749 } 757 }
750 758
759 zone->managed_pages += onlined_pages;
751 zone->present_pages += onlined_pages; 760 zone->present_pages += onlined_pages;
752 zone->zone_pgdat->node_present_pages += onlined_pages; 761 zone->zone_pgdat->node_present_pages += onlined_pages;
753 if (onlined_pages) { 762 if (onlined_pages) {
@@ -1321,6 +1330,7 @@ repeat:
1321 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1330 /* reset pagetype flags and makes migrate type to be MOVABLE */
1322 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1331 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1323 /* removal success */ 1332 /* removal success */
1333 zone->managed_pages -= offlined_pages;
1324 zone->present_pages -= offlined_pages; 1334 zone->present_pages -= offlined_pages;
1325 zone->zone_pgdat->node_present_pages -= offlined_pages; 1335 zone->zone_pgdat->node_present_pages -= offlined_pages;
1326 totalram_pages -= offlined_pages; 1336 totalram_pages -= offlined_pages;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bd82f6b31411..b8294fc03df8 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
141{
142 struct zone *z;
143
144 /*
145 * In free_area_init_core(), highmem zone's managed_pages is set to
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z))
153 z->managed_pages = 0;
154}
155
140/** 156/**
141 * free_all_bootmem_node - release a node's free pages to the buddy allocator 157 * free_all_bootmem_node - release a node's free pages to the buddy allocator
142 * @pgdat: node to be released 158 * @pgdat: node to be released
@@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
146unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 162unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
147{ 163{
148 register_page_bootmem_info_node(pgdat); 164 register_page_bootmem_info_node(pgdat);
165 reset_node_lowmem_managed_pages(pgdat);
149 166
150 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ 167 /* free_low_memory_core_early(MAX_NUMNODES) will be called later */
151 return 0; 168 return 0;
@@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
158 */ 175 */
159unsigned long __init free_all_bootmem(void) 176unsigned long __init free_all_bootmem(void)
160{ 177{
178 struct pglist_data *pgdat;
179
180 for_each_online_pgdat(pgdat)
181 reset_node_lowmem_managed_pages(pgdat);
182
161 /* 183 /*
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 184 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 185 * because in some case like Node0 doesn't have RAM installed
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bf0d43d646b..0b6a6d04300a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -735,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
735 local_irq_restore(flags); 735 local_irq_restore(flags);
736} 736}
737 737
738/*
739 * Read access to zone->managed_pages is safe because it's unsigned long,
740 * but we still need to serialize writers. Currently all callers of
741 * __free_pages_bootmem() except put_page_bootmem() should only be used
742 * at boot time. So for shorter boot time, we shift the burden to
743 * put_page_bootmem() to serialize writers.
744 */
738void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 745void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
739{ 746{
740 unsigned int nr_pages = 1 << order; 747 unsigned int nr_pages = 1 << order;
@@ -750,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
750 set_page_count(p, 0); 757 set_page_count(p, 0);
751 } 758 }
752 759
760 page_zone(page)->managed_pages += 1 << order;
753 set_page_refcounted(page); 761 set_page_refcounted(page);
754 __free_pages(page, order); 762 __free_pages(page, order);
755} 763}
@@ -2984,6 +2992,7 @@ void show_free_areas(unsigned int filter)
2984 " isolated(anon):%lukB" 2992 " isolated(anon):%lukB"
2985 " isolated(file):%lukB" 2993 " isolated(file):%lukB"
2986 " present:%lukB" 2994 " present:%lukB"
2995 " managed:%lukB"
2987 " mlocked:%lukB" 2996 " mlocked:%lukB"
2988 " dirty:%lukB" 2997 " dirty:%lukB"
2989 " writeback:%lukB" 2998 " writeback:%lukB"
@@ -3013,6 +3022,7 @@ void show_free_areas(unsigned int filter)
3013 K(zone_page_state(zone, NR_ISOLATED_ANON)), 3022 K(zone_page_state(zone, NR_ISOLATED_ANON)),
3014 K(zone_page_state(zone, NR_ISOLATED_FILE)), 3023 K(zone_page_state(zone, NR_ISOLATED_FILE)),
3015 K(zone->present_pages), 3024 K(zone->present_pages),
3025 K(zone->managed_pages),
3016 K(zone_page_state(zone, NR_MLOCK)), 3026 K(zone_page_state(zone, NR_MLOCK)),
3017 K(zone_page_state(zone, NR_FILE_DIRTY)), 3027 K(zone_page_state(zone, NR_FILE_DIRTY)),
3018 K(zone_page_state(zone, NR_WRITEBACK)), 3028 K(zone_page_state(zone, NR_WRITEBACK)),
@@ -4502,48 +4512,54 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4502 4512
4503 for (j = 0; j < MAX_NR_ZONES; j++) { 4513 for (j = 0; j < MAX_NR_ZONES; j++) {
4504 struct zone *zone = pgdat->node_zones + j; 4514 struct zone *zone = pgdat->node_zones + j;
4505 unsigned long size, realsize, memmap_pages; 4515 unsigned long size, realsize, freesize, memmap_pages;
4506 4516
4507 size = zone_spanned_pages_in_node(nid, j, zones_size); 4517 size = zone_spanned_pages_in_node(nid, j, zones_size);
4508 realsize = size - zone_absent_pages_in_node(nid, j, 4518 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4509 zholes_size); 4519 zholes_size);
4510 4520
4511 /* 4521 /*
4512 * Adjust realsize so that it accounts for how much memory 4522 * Adjust freesize so that it accounts for how much memory
4513 * is used by this zone for memmap. This affects the watermark 4523 * is used by this zone for memmap. This affects the watermark
4514 * and per-cpu initialisations 4524 * and per-cpu initialisations
4515 */ 4525 */
4516 memmap_pages = 4526 memmap_pages =
4517 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 4527 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
4518 if (realsize >= memmap_pages) { 4528 if (freesize >= memmap_pages) {
4519 realsize -= memmap_pages; 4529 freesize -= memmap_pages;
4520 if (memmap_pages) 4530 if (memmap_pages)
4521 printk(KERN_DEBUG 4531 printk(KERN_DEBUG
4522 " %s zone: %lu pages used for memmap\n", 4532 " %s zone: %lu pages used for memmap\n",
4523 zone_names[j], memmap_pages); 4533 zone_names[j], memmap_pages);
4524 } else 4534 } else
4525 printk(KERN_WARNING 4535 printk(KERN_WARNING
4526 " %s zone: %lu pages exceeds realsize %lu\n", 4536 " %s zone: %lu pages exceeds freesize %lu\n",
4527 zone_names[j], memmap_pages, realsize); 4537 zone_names[j], memmap_pages, freesize);
4528 4538
4529 /* Account for reserved pages */ 4539 /* Account for reserved pages */
4530 if (j == 0 && realsize > dma_reserve) { 4540 if (j == 0 && freesize > dma_reserve) {
4531 realsize -= dma_reserve; 4541 freesize -= dma_reserve;
4532 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 4542 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
4533 zone_names[0], dma_reserve); 4543 zone_names[0], dma_reserve);
4534 } 4544 }
4535 4545
4536 if (!is_highmem_idx(j)) 4546 if (!is_highmem_idx(j))
4537 nr_kernel_pages += realsize; 4547 nr_kernel_pages += freesize;
4538 nr_all_pages += realsize; 4548 nr_all_pages += freesize;
4539 4549
4540 zone->spanned_pages = size; 4550 zone->spanned_pages = size;
4541 zone->present_pages = realsize; 4551 zone->present_pages = freesize;
4552 /*
4553 * Set an approximate value for lowmem here, it will be adjusted
4554 * when the bootmem allocator frees pages into the buddy system.
4555 * And all highmem pages will be managed by the buddy system.
4556 */
4557 zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
4542#ifdef CONFIG_NUMA 4558#ifdef CONFIG_NUMA
4543 zone->node = nid; 4559 zone->node = nid;
4544 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4560 zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
4545 / 100; 4561 / 100;
4546 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 4562 zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
4547#endif 4563#endif
4548 zone->name = zone_names[j]; 4564 zone->name = zone_names[j];
4549 spin_lock_init(&zone->lock); 4565 spin_lock_init(&zone->lock);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a4a522c0b0f..df14808f0a36 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -994,14 +994,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
994 "\n high %lu" 994 "\n high %lu"
995 "\n scanned %lu" 995 "\n scanned %lu"
996 "\n spanned %lu" 996 "\n spanned %lu"
997 "\n present %lu", 997 "\n present %lu"
998 "\n managed %lu",
998 zone_page_state(zone, NR_FREE_PAGES), 999 zone_page_state(zone, NR_FREE_PAGES),
999 min_wmark_pages(zone), 1000 min_wmark_pages(zone),
1000 low_wmark_pages(zone), 1001 low_wmark_pages(zone),
1001 high_wmark_pages(zone), 1002 high_wmark_pages(zone),
1002 zone->pages_scanned, 1003 zone->pages_scanned,
1003 zone->spanned_pages, 1004 zone->spanned_pages,
1004 zone->present_pages); 1005 zone->present_pages,
1006 zone->managed_pages);
1005 1007
1006 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1008 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
1007 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 1009 seq_printf(m, "\n %-12s %lu", vmstat_text[i],