From 2a7684a23e9c263c2a1e8b2c0027ad1836a0f9df Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 16 Sep 2009 11:50:12 +0200 Subject: HWPOISON: check and isolate corrupted free pages v2 If memory corruption hits the free buddy pages, we can safely ignore them. No one will access them until page allocation time, then prep_new_page() will automatically check and isolate PG_hwpoison page for us (for 0-order allocation). This patch expands prep_new_page() to check every component page in a high order page allocation, in order to completely stop PG_hwpoison pages from being recirculated. Note that the common case -- only allocating a single page, doesn't do any more work than before. Allocating > order 0 does a bit more work, but that's relatively uncommon. This simple implementation may drop some innocent neighbor pages, hopefully it is not a big problem because the event should be rare enough. This patch adds some runtime costs to high order page users. [AK: Improved description] v2: Andi Kleen: Port to -mm code Move check into separate function. Don't dump stack in bad_pages for hwpoisoned pages. Signed-off-by: Wu Fengguang Signed-off-by: Andi Kleen --- mm/page_alloc.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0de15f46987..9faa7ad95ac5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -234,6 +234,12 @@ static void bad_page(struct page *page) static unsigned long nr_shown; static unsigned long nr_unshown; + /* Don't complain about poisoned pages */ + if (PageHWPoison(page)) { + __ClearPageBuddy(page); + return; + } + /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -646,7 +652,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static inline int check_new_page(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -655,6 +661,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) bad_page(page); return 1; } + return 0; +} + +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + if (unlikely(check_new_page(p))) + return 1; + } set_page_private(page, 0); set_page_refcounted(page); -- cgit v1.2.2 From 478b81fd84a299adb401dbbae296f3767e552999 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 21 Sep 2009 17:01:15 -0700 Subject: mm: remove obsoleted alloc_pages cpuset comment When a cpuset's nodemask is updated, all attached tasks have their cached task->mems_allowed updated by a heap instead of requiring an explicit call to cpuset_update_task_memory_state(), which has since been removed in 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"). Remove the obsoleted comment from the page allocator. Cc: Paul Menage Acked-by: Miao Xie Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0de15f46987..f4e929e356db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1627,10 +1627,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - - /* - * The task's cpuset might have expanded its set of allowable nodes - */ p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; -- cgit v1.2.2 From 112067f0905b2de862c607ee62411cf47d2fe5c4 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 21 Sep 2009 17:01:16 -0700 Subject: memory hotplug: update zone pcp at memory online In my test, 128M memory is hot added, but zone's pcp batch is 0, which is an obvious error. When pages are onlined, zone pcp should be updated accordingly. [akpm@linux-foundation.org: fix warnings] Signed-off-by: Shaohua Li Cc: Mel Gorman Cc: Christoph Lameter Cc: Yakui Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4e929e356db..1a3a893ef50e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3142,6 +3142,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } +static int __zone_pcp_update(void *data) +{ + struct zone *zone = data; + int cpu; + unsigned long batch = zone_batchsize(zone), flags; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = zone_pcp(zone, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + setup_pageset(pset, batch); + local_irq_restore(flags); + } + return 0; +} + +void zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} + static __meminit void zone_pcp_init(struct zone *zone) { int cpu; -- cgit v1.2.2 From 6fb332fabd7288af9dbe7992394aa6ba97c1a537 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 21 Sep 2009 17:01:17 -0700 Subject: memory hotplug: exclude isolated page from pco page alloc Pages marked as isolated should not be allocated again. If such pages reside in pcp list, they can be allocated too, so there is a ping-pong memory offline frees some pages to pcp list and the pages get allocated and then memory offline frees them again, this loop will happen again and again. This should have no impact in normal code path, because in normal code path, pages in pcp list aren't isolated, and below loop will break in the first entry. Signed-off-by: Shaohua Li Cc: Mel Gorman Cc: Christoph Lameter Cc: Yakui Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a3a893ef50e..8a8302711725 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1143,10 +1143,20 @@ again: /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { + int get_one_page = 0; + pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list, migratetype, cold); - page = list_entry(pcp->list.next, struct page, lru); + list_for_each_entry(page, &pcp->list, lru) { + if (get_pageblock_migratetype(page) != + MIGRATE_ISOLATE) { + get_one_page = 1; + break; + } + } + if (!get_one_page) + goto failed; } list_del(&page->lru); -- cgit v1.2.2 From 8e7e40d9658cf7b2ae2b76484e235799b3ddaa97 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 21 Sep 2009 17:01:18 -0700 Subject: memory hotplug: make pages from movable zone always isolatable Pages on movable zone have two types, MIGRATE_MOVABLE and MIGRATE_RESERVE, both them can be movable, because only movable memory allocation can get pages from movable zone. This makes pages in movable zone always be able to migrate. Signed-off-by: Shaohua Li Cc: Mel Gorman Cc: Christoph Lameter Cc: Yakui Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8a8302711725..7b4d4e4ea64f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4906,13 +4906,16 @@ int set_migratetype_isolate(struct page *page) struct zone *zone; unsigned long flags; int ret = -EBUSY; + int zone_idx; zone = page_zone(page); + zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); /* * In future, more migrate types will be able to be isolation target. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && + zone_idx != ZONE_MOVABLE) goto out; set_pageblock_migratetype(page, MIGRATE_ISOLATE); move_freepages_block(zone, page, MIGRATE_ISOLATE); -- cgit v1.2.2 From 55a4462af5722d2814858bc51ee8d58ca29544ab Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 21 Sep 2009 17:01:20 -0700 Subject: page_alloc: fix kernel-doc warning Ummark function as having kernel-doc notation, fixing the kernel-doc warning. Warning(mm/page_alloc.c:4519): No description found for parameter 'zone' Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7b4d4e4ea64f..91ae36c30330 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4541,7 +4541,7 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } -/** +/* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. -- cgit v1.2.2 From 3701b0332330ca1add3e5d56513ef201ff7efdbb Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:01:29 -0700 Subject: mm: show_free_areas(): display slab pages in two separate fields If an OOM happens, we really want to know the number of remaining reclaimable pages. So the reclaimable slab and unreclaimable slab fields should not be combined for display. Signed-off-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 91ae36c30330..bdf12f36bae0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2138,7 +2138,8 @@ void show_free_areas(void) " inactive_file:%lu" " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), @@ -2148,8 +2149,8 @@ void show_free_areas(void) global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), global_page_state(NR_FREE_PAGES), - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_page_state(NR_SLAB_RECLAIMABLE), + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); -- cgit v1.2.2 From 4a0aa73f1d613bf19bc8610bf090c941ef49d720 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:01:30 -0700 Subject: mm: oom analysis: add per-zone statistics to show_free_areas() show_free_areas() displays only a limited amount of zone counters. This patch includes additional counters in the display to allow easier debugging. This may be especially useful if an OOM is due to running out of DMA memory. Signed-off-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Acked-by: Wu Fengguang Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdf12f36bae0..8fbf5a4f5cf7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2170,6 +2170,16 @@ void show_free_areas(void) " inactive_file:%lukB" " unevictable:%lukB" " present:%lukB" + " mlocked:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " mapped:%lukB" + " slab_reclaimable:%lukB" + " slab_unreclaimable:%lukB" + " pagetables:%lukB" + " unstable:%lukB" + " bounce:%lukB" + " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", @@ -2184,6 +2194,16 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_UNEVICTABLE)), K(zone->present_pages), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_FILE_DIRTY)), + K(zone_page_state(zone, NR_WRITEBACK)), + K(zone_page_state(zone, NR_FILE_MAPPED)), + K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), + K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), + K(zone_page_state(zone, NR_PAGETABLE)), + K(zone_page_state(zone, NR_UNSTABLE_NFS)), + K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") ); -- cgit v1.2.2 From 71de1ccbe1fb40203edd3beb473f8580d917d2ca Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:01:31 -0700 Subject: mm: oom analysis: add buffer cache information to show_free_areas() It is often useful to know the statistics for all pages that are handled like page cache pages when looking at OOM log output. Therefore show_free_areas() should also display buffer cache statistics. Signed-off-by: KOSAKI Motohiro Acked-by: Wu Fengguang Reviewed-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8fbf5a4f5cf7..494c09196c30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2137,7 +2137,7 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" + " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), @@ -2148,6 +2148,7 @@ void show_free_areas(void) global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), + nr_blockdev_pages(), global_page_state(NR_FREE_PAGES), global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), -- cgit v1.2.2 From c6a7f5728a1db45d30df55a01adc130b4ab0327c Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:01:32 -0700 Subject: mm: oom analysis: Show kernel stack usage in /proc/meminfo and OOM log output The amount of memory allocated to kernel stacks can become significant and cause OOM conditions. However, we do not display the amount of memory consumed by stacks. Add code to display the amount of memory used for stacks in /proc/meminfo. Signed-off-by: KOSAKI Motohiro Reviewed-by: Christoph Lameter Reviewed-by: Minchan Kim Reviewed-by: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 494c09196c30..4e050f325ebd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2177,6 +2177,7 @@ void show_free_areas(void) " mapped:%lukB" " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" + " kernel_stack:%lukB" " pagetables:%lukB" " unstable:%lukB" " bounce:%lukB" @@ -2201,6 +2202,8 @@ void show_free_areas(void) K(zone_page_state(zone, NR_FILE_MAPPED)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), + zone_page_state(zone, NR_KERNEL_STACK) * + THREAD_SIZE / 1024, K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_UNSTABLE_NFS)), K(zone_page_state(zone, NR_BOUNCE)), -- cgit v1.2.2 From 4b02108ac1b3354a22b0d83c684797692efdc395 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:01:33 -0700 Subject: mm: oom analysis: add shmem vmstat Recently we encountered OOM problems due to memory use of the GEM cache. Generally a large amuont of Shmem/Tmpfs pages tend to create a memory shortage problem. We often use the following calculation to determine the amount of shmem pages: shmem = NR_ACTIVE_ANON + NR_INACTIVE_ANON - NR_ANON_PAGES however the expression does not consider isolated and mlocked pages. This patch adds explicit accounting for pages used by shmem and tmpfs. Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Reviewed-by: Christoph Lameter Acked-by: Wu Fengguang Cc: David Rientjes Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e050f325ebd..e50c22545b8f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2139,7 +2139,7 @@ void show_free_areas(void) " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" - " mapped:%lu pagetables:%lu bounce:%lu\n", + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), @@ -2153,6 +2153,7 @@ void show_free_areas(void) global_page_state(NR_SLAB_RECLAIMABLE), global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), + global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); @@ -2175,6 +2176,7 @@ void show_free_areas(void) " dirty:%lukB" " writeback:%lukB" " mapped:%lukB" + " shmem:%lukB" " slab_reclaimable:%lukB" " slab_unreclaimable:%lukB" " kernel_stack:%lukB" @@ -2200,6 +2202,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_FILE_DIRTY)), K(zone_page_state(zone, NR_WRITEBACK)), K(zone_page_state(zone, NR_FILE_MAPPED)), + K(zone_page_state(zone, NR_SHMEM)), K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), zone_page_state(zone, NR_KERNEL_STACK) * -- cgit v1.2.2 From b259fbde0a86085264c89aa2ce9c6e35792a1aad Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 21 Sep 2009 17:01:34 -0700 Subject: mm: update alloc_flags after oom killer has been called It is possible for the oom killer to select current as the task to kill. When this happens, alloc_flags needs to be updated accordingly to set ALLOC_NO_WATERMARKS so the subsequent allocation attempt may use memory reserves as the result of its thread having TIF_MEMDIE set if the allocation is not __GFP_NOMEMALLOC. Acked-by: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e50c22545b8f..b6d0d09557ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1771,6 +1771,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, wake_all_kswapd(order, zonelist, high_zoneidx); +restart: /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according @@ -1778,7 +1779,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ alloc_flags = gfp_to_alloc_flags(gfp_mask); -restart: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, -- cgit v1.2.2 From a731286de62294b63d8ceb3c5914ac52cc17e690 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 21 Sep 2009 17:01:37 -0700 Subject: mm: vmstat: add isolate pages If the system is running a heavy load of processes then concurrent reclaim can isolate a large number of pages from the LRU. /proc/vmstat and the output generated for an OOM do not show how many pages were isolated. This has been observed during process fork bomb testing (mstctl11 in LTP). This patch shows the information about isolated pages. Reproduced via: ----------------------- % ./hackbench 140 process 1000 => OOM occur active_anon:146 inactive_anon:0 isolated_anon:49245 active_file:79 inactive_file:18 isolated_file:113 unevictable:0 dirty:0 writeback:0 unstable:0 buffer:39 free:370 slab_reclaimable:309 slab_unreclaimable:5492 mapped:53 shmem:15 pagetables:28140 bounce:0 Signed-off-by: KOSAKI Motohiro Acked-by: Rik van Riel Acked-by: Wu Fengguang Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b6d0d09557ef..afda8fd16484 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2134,16 +2134,18 @@ void show_free_areas(void) } } - printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" - " inactive_file:%lu" + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_ISOLATED_ANON), + global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_FILE), + global_page_state(NR_ISOLATED_FILE), global_page_state(NR_UNEVICTABLE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), @@ -2171,6 +2173,8 @@ void show_free_areas(void) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" " present:%lukB" " mlocked:%lukB" " dirty:%lukB" @@ -2197,6 +2201,8 @@ void show_free_areas(void) K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_UNEVICTABLE)), + K(zone_page_state(zone, NR_ISOLATED_ANON)), + K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), K(zone_page_state(zone, NR_MLOCK)), K(zone_page_state(zone, NR_FILE_DIRTY)), -- cgit v1.2.2 From 945a11136ebdfa7fcce319ee6215958e84cb85f6 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 21 Sep 2009 17:01:47 -0700 Subject: mm: add gfp mask checking for __get_free_pages() __get_free_pages() with __GFP_HIGHMEM is not safe because the return address cannot represent a highmem page. get_zeroed_page() already has such a debug checking. Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afda8fd16484..81926c7ef6f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1922,31 +1922,25 @@ EXPORT_SYMBOL(__alloc_pages_nodemask); */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { - struct page * page; + struct page *page; + + /* + * __get_free_pages() returns a 32-bit address, which cannot represent + * a highmem page + */ + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } - EXPORT_SYMBOL(__get_free_pages); unsigned long get_zeroed_page(gfp_t gfp_mask) { - struct page * page; - - /* - * get_zeroed_page() returns a 32-bit address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask | __GFP_ZERO, 0); - if (page) - return (unsigned long) page_address(page); - return 0; + return __get_free_pages(gfp_mask | __GFP_ZERO, 0); } - EXPORT_SYMBOL(get_zeroed_page); void __pagevec_free(struct pagevec *pvec) -- cgit v1.2.2 From 451ea25da71590361c71bf3044c55b870a887d53 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 21 Sep 2009 17:01:48 -0700 Subject: mm: perform non-atomic test-clear of PG_mlocked on free By the time PG_mlocked is cleared in the page freeing path, nobody else is looking at our page->flags anymore. It is thus safe to make the test-and-clear non-atomic and thereby removing an unnecessary and expensive operation from a hotpath. Signed-off-by: Johannes Weiner Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Cc: Christoph Lameter Cc: Mel Gorman Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 81926c7ef6f0..9242d13f4ff3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -557,7 +557,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; - int wasMlocked = TestClearPageMlocked(page); + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, order); @@ -1026,7 +1026,7 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int wasMlocked = TestClearPageMlocked(page); + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); -- cgit v1.2.2 From 2f66a68f3fac2e94da360c342ff78ab45553f86c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:02:31 -0700 Subject: page-allocator: change migratetype for all pageblocks within a high-order page during __rmqueue_fallback When there are no pages of a target migratetype free, the page allocator selects a high-order block of another migratetype to allocate from. When the order of the page taken is greater than pageblock_order, all pageblocks within that high-order page should change migratetype so that pages are later freed to the correct free-lists. The current behaviour is that pageblocks change migratetype if the order being split matches the pageblock_order. When pageblock_order < MAX_ORDER-1, ownership is not changing correct and pages are being later freed to the incorrect list and this impacts fragmentation avoidance. This patch changes all pageblocks within the high-order page being split to the correct migratetype. Without the patch, allocation success rates for hugepages under stress were about 59% of physical memory on x86-64. With the patch applied, this goes up to 65%. Signed-off-by: Mel Gorman Cc: Andy Whitcroft Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9242d13f4ff3..20759803a64a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -783,6 +783,17 @@ static int move_freepages_block(struct zone *zone, struct page *page, return move_freepages(zone, start_page, end_page, migratetype); } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -836,8 +847,9 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - if (current_order == pageblock_order) - set_pageblock_migratetype(page, + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) + change_pageblock_range(page, current_order, start_migratetype); expand(zone, page, order, current_order, area, migratetype); -- cgit v1.2.2 From 38a398572fa2d8124f7479e40db581b5b72719c9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:02:39 -0700 Subject: page-allocator: remove dead function free_cold_page() The function free_cold_page() has no callers so delete it. Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20759803a64a..913a8ebd3a8e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1079,11 +1079,6 @@ void free_hot_page(struct page *page) free_hot_cold_page(page, 0); } -void free_cold_page(struct page *page) -{ - free_hot_cold_page(page, 1); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Mon, 21 Sep 2009 17:02:41 -0700 Subject: tracing, page-allocator: add trace events for page allocation and page freeing This patch adds trace events for the allocation and freeing of pages, including the freeing of pagevecs. Using the events, it will be known what struct page and pfns are being allocated and freed and what the call site was in many cases. The page alloc tracepoints be used as an indicator as to whether the workload was heavily dependant on the page allocator or not. You can make a guess based on vmstat but you can't get a per-process breakdown. Depending on the call path, the call_site for page allocation may be __get_free_pages() instead of a useful callsite. Instead of passing down a return address similar to slab debugging, the user should enable the stacktrace and seg-addr options to get a proper stack trace. The pagevec free tracepoint has a different usecase. It can be used to get a idea of how many pages are being dumped off the LRU and whether it is kswapd doing the work or a process doing direct reclaim. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Ingo Molnar Cc: Larry Woodman Cc: Peter Zijlstra Cc: Li Ming Chun Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 913a8ebd3a8e..80f954d82d77 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1076,6 +1076,7 @@ static void free_hot_cold_page(struct page *page, int cold) void free_hot_page(struct page *page) { + trace_mm_page_free_direct(page, 0); free_hot_cold_page(page, 0); } @@ -1920,6 +1921,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -1954,13 +1956,16 @@ void __pagevec_free(struct pagevec *pvec) { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { + trace_mm_pagevec_free(pvec->pages[i], pvec->cold); free_hot_cold_page(pvec->pages[i], pvec->cold); + } } void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { + trace_mm_page_free_direct(page, order); if (order == 0) free_hot_page(page); else -- cgit v1.2.2 From e0fff1bd12469c45dab088e353d8882761387bb6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:02:42 -0700 Subject: tracing, page-allocator: add trace events for anti-fragmentation falling back to other migratetypes Fragmentation avoidance depends on being able to use free pages from lists of the appropriate migrate type. In the event this is not possible, __rmqueue_fallback() selects a different list and in some circumstances change the migratetype of the pageblock. Simplistically, the more times this event occurs, the more likely that fragmentation will be a problem later for hugepage allocation at least but there are other considerations such as the order of page being split to satisfy the allocation. This patch adds a trace event for __rmqueue_fallback() that reports what page is being used for the fallback, the orders of relevant pages, the desired migratetype and the migratetype of the lists being used, whether the pageblock changed type and whether this event is important with respect to fragmentation avoidance or not. This information can be used to help analyse fragmentation avoidance and help decide whether min_free_kbytes should be increased or not. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Ingo Molnar Cc: Larry Woodman Cc: Peter Zijlstra Cc: Li Ming Chun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 80f954d82d77..77f517c18b37 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -853,6 +853,10 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) start_migratetype); expand(zone, page, order, current_order, area, migratetype); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype); + return page; } } -- cgit v1.2.2 From 0d3d062a6e289e065bd0aa537a6806a1806bf8aa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:02:44 -0700 Subject: tracing, page-allocator: add trace event for page traffic related to the buddy lists The page allocation trace event reports that a page was successfully allocated but it does not specify where it came from. When analysing performance, it can be important to distinguish between pages coming from the per-cpu allocator and pages coming from the buddy lists as the latter requires the zone lock to the taken and more data structures to be examined. This patch adds a trace event for __rmqueue reporting when a page is being allocated from the buddy lists. It distinguishes between being called to refill the per-cpu lists or whether it is a high-order allocation. Similarly, this patch adds an event to catch when the PCP lists are being drained a little and pages are going back to the buddy lists. This is trickier to draw conclusions from but high activity on those events could explain why there were a large number of cache misses on a page-allocator-intensive workload. The coalescing and splitting of buddies involves a lot of writing of page metadata and cache line bounces not to mention the acquisition of an interrupt-safe lock necessary to enter this path. [akpm@linux-foundation.org: fix build] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Reviewed-by: Ingo Molnar Cc: Larry Woodman Cc: Peter Zijlstra Cc: Li Ming Chun Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77f517c18b37..4c847cc57caf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -535,6 +536,7 @@ static void free_pages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); + trace_mm_page_pcpu_drain(page, order, page_private(page)); __free_one_page(page, zone, order, page_private(page)); } spin_unlock(&zone->lock); @@ -890,6 +892,7 @@ retry_reserve: } } + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } -- cgit v1.2.2 From 78986a678f6ec3759a01976749f4437d8bf2d6c3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:03:02 -0700 Subject: page-allocator: limit the number of MIGRATE_RESERVE pageblocks per zone After anti-fragmentation was merged, a bug was reported whereby devices that depended on high-order atomic allocations were failing. The solution was to preserve a property in the buddy allocator which tended to keep the minimum number of free pages in the zone at the lower physical addresses and contiguous. To preserve this property, MIGRATE_RESERVE was introduced and a number of pageblocks at the start of a zone would be marked "reserve", the number of which depended on min_free_kbytes. Anti-fragmentation works by avoiding the mixing of page migratetypes within the same pageblock. One way of helping this is to increase min_free_kbytes because it becomes less like that it will be necessary to place pages of of MIGRATE_RESERVE is unbounded, the free memory is kept there in large contiguous blocks instead of helping anti-fragmentation as much as it should. With the page-allocator tracepoint patches applied, it was found during anti-fragmentation tests that the number of fragmentation-related events were far higher than expected even with min_free_kbytes at higher values. This patch limits the number of MIGRATE_RESERVE blocks that exist per zone to two. For example, with a sufficient min_free_kbytes, 4MB of memory will be kept aside on an x86-64 and remain more or less free and contiguous for the systems uptime. This should be sufficient for devices depending on high-order atomic allocations while helping fragmentation control when min_free_kbytes is tuned appropriately. As side-effect of this patch is that the reserve variable is converted to int as unsigned long was the wrong type to use when ensuring that only the required number of reserve blocks are created. With the patches applied, fragmentation-related events as measured by the page allocator tracepoints were significantly reduced when running some fragmentation stress-tests on systems with min_free_kbytes tuned to a value appropriate for hugepage allocations at runtime. On x86, the events recorded were reduced by 99.8%, on x86-64 by 99.72% and on ppc64 by 99.83%. Signed-off-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4c847cc57caf..33b1a4762a7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2836,7 +2836,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) { unsigned long start_pfn, pfn, end_pfn; struct page *page; - unsigned long reserve, block_migratetype; + unsigned long block_migratetype; + int reserve; /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; @@ -2844,6 +2845,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; + /* + * Reserve blocks are generally in place to help high-order atomic + * allocations that are short-lived. A min_free_kbytes value that + * would result in more than 2 reserve blocks for atomic allocations + * is assumed to be in place to help anti-fragmentation for the + * future allocation of hugepages at runtime. + */ + reserve = min(2, reserve); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) continue; -- cgit v1.2.2 From 2c85f51d222ccdd8c401d77a36b723a89156810d Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 21 Sep 2009 17:03:07 -0700 Subject: mm: also use alloc_large_system_hash() for the PID hash table This is being done by allowing boot time allocations to specify that they may want a sub-page sized amount of memory. Overall this seems more consistent with the other hash table allocations, and allows making two supposedly mm-only variables really mm-only (nr_{kernel,all}_pages). Signed-off-by: Jan Beulich Cc: Ingo Molnar Cc: "Eric W. Biederman" Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 33b1a4762a7b..770f011e1c12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -124,8 +124,8 @@ static char * const zone_names[MAX_NR_ZONES] = { int min_free_kbytes = 1024; -unsigned long __meminitdata nr_kernel_pages; -unsigned long __meminitdata nr_all_pages; +static unsigned long __meminitdata nr_kernel_pages; +static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP @@ -4821,7 +4821,14 @@ void *__init alloc_large_system_hash(const char *tablename, numentries <<= (PAGE_SHIFT - scale); /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely(flags & HASH_SMALL)) { + /* Makes no sense without HASH_EARLY */ + WARN_ON(!(flags & HASH_EARLY)); + if (!(numentries >> *_hash_shift)) { + numentries = 1UL << *_hash_shift; + BUG_ON(!numentries); + } + } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); -- cgit v1.2.2 From f86296317434b21585e229f6c49a33cb9ebab4d3 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 21 Sep 2009 17:03:11 -0700 Subject: mm: do batched scans for mem_cgroup For mem_cgroup, shrink_zone() may call shrink_list() with nr_to_scan=1, in which case shrink_list() _still_ calls isolate_pages() with the much larger SWAP_CLUSTER_MAX. It effectively scales up the inactive list scan rate by up to 32 times. For example, with 16k inactive pages and DEF_PRIORITY=12, (16k >> 12)=4. So when shrink_zone() expects to scan 4 pages in the active/inactive list, the active list will be scanned 4 pages, while the inactive list will be (over) scanned SWAP_CLUSTER_MAX=32 pages in effect. And that could break the balance between the two lists. It can further impact the scan of anon active list, due to the anon active/inactive ratio rebalance logic in balance_pgdat()/shrink_zone(): inactive anon list over scanned => inactive_anon_is_low() == TRUE => shrink_active_list() => active anon list over scanned So the end result may be - anon inactive => over scanned - anon active => over scanned (maybe not as much) - file inactive => over scanned - file active => under scanned (relatively) The accesses to nr_saved_scan are not lock protected and so not 100% accurate, however we can tolerate small errors and the resulted small imbalanced scan rates between zones. Cc: Rik van Riel Reviewed-by: KOSAKI Motohiro Acked-by: Balbir Singh Reviewed-by: Minchan Kim Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 770f011e1c12..84d9da1e8f4c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3809,7 +3809,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_saved_scan = 0; + zone->reclaim_stat.nr_saved_scan[l] = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; -- cgit v1.2.2 From 5f8dcc21211a3d4e3a7a5ca366b469fb88117f61 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:03:19 -0700 Subject: page-allocator: split per-cpu list into one-list-per-migrate-type The following two patches remove searching in the page allocator fast-path by maintaining multiple free-lists in the per-cpu structure. At the time the search was introduced, increasing the per-cpu structures would waste a lot of memory as per-cpu structures were statically allocated at compile-time. This is no longer the case. The patches are as follows. They are based on mmotm-2009-08-27. Patch 1 adds multiple lists to struct per_cpu_pages, one per migratetype that can be stored on the PCP lists. Patch 2 notes that the pcpu drain path check empty lists multiple times. The patch reduces the number of checks by maintaining a count of free lists encountered. Lists containing pages will then free multiple pages in batch The patches were tested with kernbench, netperf udp/tcp, hackbench and sysbench. The netperf tests were not bound to any CPU in particular and were run such that the results should be 99% confidence that the reported results are within 1% of the estimated mean. sysbench was run with a postgres background and read-only tests. Similar to netperf, it was run multiple times so that it's 99% confidence results are within 1%. The patches were tested on x86, x86-64 and ppc64 as x86: Intel Pentium D 3GHz with 8G RAM (no-brand machine) kernbench - No significant difference, variance well within noise netperf-udp - 1.34% to 2.28% gain netperf-tcp - 0.45% to 1.22% gain hackbench - Small variances, very close to noise sysbench - Very small gains x86-64: AMD Phenom 9950 1.3GHz with 8G RAM (no-brand machine) kernbench - No significant difference, variance well within noise netperf-udp - 1.83% to 10.42% gains netperf-tcp - No conclusive until buffer >= PAGE_SIZE 4096 +15.83% 8192 + 0.34% (not significant) 16384 + 1% hackbench - Small gains, very close to noise sysbench - 0.79% to 1.6% gain ppc64: PPC970MP 2.5GHz with 10GB RAM (it's a terrasoft powerstation) kernbench - No significant difference, variance well within noise netperf-udp - 2-3% gain for almost all buffer sizes tested netperf-tcp - losses on small buffers, gains on larger buffers possibly indicates some bad caching effect. hackbench - No significant difference sysbench - 2-4% gain This patch: Currently the per-cpu page allocator searches the PCP list for pages of the correct migrate-type to reduce the possibility of pages being inappropriate placed from a fragmentation perspective. This search is potentially expensive in a fast-path and undesirable. Splitting the per-cpu list into multiple lists increases the size of a per-cpu structure and this was potentially a major problem at the time the search was introduced. These problem has been mitigated as now only the necessary number of structures is allocated for the running system. This patch replaces a list search in the per-cpu allocator with one list per migrate type. The potential snag with this approach is when bulk freeing pages. We round-robin free pages based on migrate type which has little bearing on the cache hotness of the page and potentially checks empty lists repeatedly in the event the majority of PCP pages are of one type. Signed-off-by: Mel Gorman Acked-by: Nick Piggin Cc: Christoph Lameter Cc: Minchan Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 106 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 59 insertions(+), 47 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 84d9da1e8f4c..1b1c39e6a9b8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -511,7 +511,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a list of pages. + * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -521,23 +521,36 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static void free_pages_bulk(struct zone *zone, int count, - struct list_head *list, int order) +static void free_pcppages_bulk(struct zone *zone, int count, + struct per_cpu_pages *pcp) { + int migratetype = 0; + spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); + __mod_zone_page_state(zone, NR_FREE_PAGES, count); while (count--) { struct page *page; + struct list_head *list; + + /* + * Remove pages from lists in a round-robin fashion. This spinning + * around potentially empty lists is bloody awful, alternatives that + * don't suck are welcome + */ + do { + if (++migratetype == MIGRATE_PCPTYPES) + migratetype = 0; + list = &pcp->lists[migratetype]; + } while (list_empty(list)); - VM_BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - trace_mm_page_pcpu_drain(page, order, page_private(page)); - __free_one_page(page, zone, order, page_private(page)); + trace_mm_page_pcpu_drain(page, 0, migratetype); + __free_one_page(page, zone, 0, migratetype); } spin_unlock(&zone->lock); } @@ -953,7 +966,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->batch; else to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); + free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; local_irq_restore(flags); } @@ -979,7 +992,7 @@ static void drain_pages(unsigned int cpu) pcp = &pset->pcp; local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); } @@ -1045,6 +1058,7 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + int migratetype; int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); @@ -1062,21 +1076,39 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; - set_page_private(page, get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_page_private(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Free ISOLATE pages back to the allocator because they are being + * offlined but treat RESERVE as movable pages so we can get those + * areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + if (migratetype >= MIGRATE_PCPTYPES) { + if (unlikely(migratetype == MIGRATE_ISOLATE)) { + free_one_page(zone, page, 0, migratetype); + goto out; + } + migratetype = MIGRATE_MOVABLE; + } + if (cold) - list_add_tail(&page->lru, &pcp->list); + list_add_tail(&page->lru, &pcp->lists[migratetype]); else - list_add(&page->lru, &pcp->list); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->batch, pcp); pcp->count -= pcp->batch; } + +out: local_irq_restore(flags); put_cpu(); } @@ -1134,46 +1166,24 @@ again: cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; + struct list_head *list; pcp = &zone_pcp(zone, cpu)->pcp; + list = &pcp->lists[migratetype]; local_irq_save(flags); - if (!pcp->count) { - pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, - migratetype, cold); - if (unlikely(!pcp->count)) - goto failed; - } - - /* Find a page of the appropriate migrate type */ - if (cold) { - list_for_each_entry_reverse(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } else { - list_for_each_entry(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } - - /* Allocate more to the pcp list if necessary */ - if (unlikely(&page->lru == &pcp->list)) { - int get_one_page = 0; - + if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, + pcp->batch, list, migratetype, cold); - list_for_each_entry(page, &pcp->list, lru) { - if (get_pageblock_migratetype(page) != - MIGRATE_ISOLATE) { - get_one_page = 1; - break; - } - } - if (!get_one_page) + if (unlikely(list_empty(list))) goto failed; } + if (cold) + page = list_entry(list->prev, struct page, lru); + else + page = list_entry(list->next, struct page, lru); + list_del(&page->lru); pcp->count--; } else { @@ -3024,6 +3034,7 @@ static int zone_batchsize(struct zone *zone) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; + int migratetype; memset(p, 0, sizeof(*p)); @@ -3031,7 +3042,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp->count = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); - INIT_LIST_HEAD(&pcp->list); + for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) + INIT_LIST_HEAD(&pcp->lists[migratetype]); } /* @@ -3223,7 +3235,7 @@ static int __zone_pcp_update(void *data) pcp = &pset->pcp; local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->count, pcp); setup_pageset(pset, batch); local_irq_restore(flags); } -- cgit v1.2.2 From a6f9edd65beaef24836e8934c8912c1e974dd45c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 21 Sep 2009 17:03:20 -0700 Subject: page-allocator: maintain rolling count of pages to free from the PCP When round-robin freeing pages from the PCP lists, empty lists may be encountered. In the event one of the lists has more pages than another, there may be numerous checks for list_empty() which is undesirable. This patch maintains a count of pages to free which is incremented when empty lists are encountered. The intention is that more pages will then be freed from fuller lists than the empty ones reducing the number of empty list checks in the free path. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Cc: Nick Piggin Cc: Christoph Lameter Reviewed-by: Minchan Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1b1c39e6a9b8..6877e22e3aa1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -525,32 +525,38 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { int migratetype = 0; + int batch_free = 0; spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; __mod_zone_page_state(zone, NR_FREE_PAGES, count); - while (count--) { + while (count) { struct page *page; struct list_head *list; /* - * Remove pages from lists in a round-robin fashion. This spinning - * around potentially empty lists is bloody awful, alternatives that - * don't suck are welcome + * Remove pages from lists in a round-robin fashion. A + * batch_free count is maintained that is incremented when an + * empty list is encountered. This is so more pages are freed + * off fuller lists instead of spinning excessively around empty + * lists */ do { + batch_free++; if (++migratetype == MIGRATE_PCPTYPES) migratetype = 0; list = &pcp->lists[migratetype]; } while (list_empty(list)); - page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_one_page list manipulates */ - list_del(&page->lru); - trace_mm_page_pcpu_drain(page, 0, migratetype); - __free_one_page(page, zone, 0, migratetype); + do { + page = list_entry(list->prev, struct page, lru); + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + __free_one_page(page, zone, 0, migratetype); + trace_mm_page_pcpu_drain(page, 0, migratetype); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } -- cgit v1.2.2 From 03f6462a3ae78f36eb1f0ee8b4d5ae2f7859c1d5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Sep 2009 17:03:35 -0700 Subject: mm: move highest_memmap_pfn Move highest_memmap_pfn __read_mostly from page_alloc.c next to zero_pfn __read_mostly in memory.c: to help them share a cacheline, since they're very often tested together in vm_normal_page(). Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Nick Piggin Cc: Mel Gorman Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6877e22e3aa1..5717f27a0704 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,7 +72,6 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -- cgit v1.2.2 From 8d65af789f3e2cf4cfbdbf71a0f7a61ebcd41d38 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 23 Sep 2009 15:57:19 -0700 Subject: sysctl: remove "struct file *" argument of ->proc_handler It's unused. It isn't needed -- read or write flag is already passed and sysctl shouldn't care about the rest. It _was_ used in two places at arch/frv for some reason. Signed-off-by: Alexey Dobriyan Cc: David Howells Cc: "Eric W. Biederman" Cc: Al Viro Cc: Ralf Baechle Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: "David S. Miller" Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5717f27a0704..88248b3c20bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2373,7 +2373,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; @@ -2382,7 +2382,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, if (write) strncpy(saved_string, (char*)table->data, NUMA_ZONELIST_ORDER_LEN); - ret = proc_dostring(table, write, file, buffer, length, ppos); + ret = proc_dostring(table, write, buffer, length, ppos); if (ret) return ret; if (write) { @@ -4706,9 +4706,9 @@ module_init(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (write) setup_per_zone_wmarks(); return 0; @@ -4716,12 +4716,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4732,12 +4732,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, } int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4758,9 +4758,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); return 0; } @@ -4772,13 +4772,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; unsigned int cpu; int ret; - ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { -- cgit v1.2.2