aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorMinchan Kim <minchan@kernel.org>2012-07-31 19:43:56 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-07-31 21:42:45 -0400
commit702d1a6e0766d45642c934444fd41f658d251305 (patch)
tree6c9144521b03f11f7ea2e709f066b90a9b9f38d5 /mm
parent2cfed0752808625d30aca7fc9f383af386fd8a13 (diff)
memory-hotplug: fix kswapd looping forever problem
When hotplug offlining happens on zone A, it starts to mark freed pages as MIGRATE_ISOLATE type in the buddy allocator to prevent further allocation. (MIGRATE_ISOLATE is an ironic type: such pages are apparently in the buddy allocator, but we can't allocate them.)

When a memory shortage happens during hotplug offlining, the current task starts to reclaim and then wakes up kswapd. Kswapd checks the watermark and then goes to sleep, because the current zone_watermark_ok_safe doesn't consider the MIGRATE_ISOLATE freed page count. The current task continues to reclaim in the direct reclaim path without kswapd's help. The problem is that zone->all_unreclaimable is set only by kswapd, so the current task would loop forever like below.

__alloc_pages_slowpath
restart:
        wake_all_kswapd
rebalance:
        __alloc_pages_direct_reclaim
                do_try_to_free_pages
                        if global_reclaim && !all_unreclaimable
                                return 1; /* It means we did did_some_progress */
        skip __alloc_pages_may_oom
        should_alloc_retry
                goto rebalance;

If we apply KOSAKI's patch[1], which doesn't depend on kswapd for setting zone->all_unreclaimable, we can solve this problem by killing some task in the direct reclaim path. But that still doesn't wake up kswapd, which could still be a problem if another subsystem needs a GFP_ATOMIC request. So kswapd should consider MIGRATE_ISOLATE when it calculates free pages BEFORE going to sleep.

This patch counts the number of MIGRATE_ISOLATE pageblocks, and zone_watermark_ok_safe will consider it if the system has such blocks (fortunately, this is very rare, so there is no problem from an overhead point of view, and kswapd is never a hot path).

Copied/modified from Mel's quote:
"
Ideal solution would be "allocating" the pageblock. It would keep the free space accounting as it is but historically, memory hotplug didn't allocate pages because it would be difficult to detect if a pageblock was isolated or if part of some balloon. Allocating just full pageblocks would work around this. However, it would play very badly with CMA.
"

[1] http://lkml.org/lkml/2012/6/14/74

[akpm@linux-foundation.org: simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate()]
[akpm@linux-foundation.org: fix CONFIG_MEMORY_ISOLATION=n build]
Signed-off-by: Minchan Kim <minchan@kernel.org>
Suggested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/page_alloc.c30
-rw-r--r--mm/page_isolation.c26
2 files changed, 54 insertions, 2 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e6635993558..6a29ed8e6e60 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -218,6 +218,11 @@ EXPORT_SYMBOL(nr_online_nodes);
218 218
219int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
220 220
221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
221void set_pageblock_migratetype(struct page *page, int migratetype) 226void set_pageblock_migratetype(struct page *page, int migratetype)
222{ 227{
223 228
@@ -1619,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1619 return true; 1624 return true;
1620} 1625}
1621 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1622bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1623 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1624{ 1643{
@@ -1634,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1634 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1635 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1636 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1637 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1638 free_pages); 1665 free_pages);
1639} 1666}
@@ -4398,6 +4425,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4398 lruvec_init(&zone->lruvec, zone); 4425 lruvec_init(&zone->lruvec, zone);
4399 zap_zone_vm_stats(zone); 4426 zap_zone_vm_stats(zone);
4400 zone->flags = 0; 4427 zone->flags = 0;
4428#ifdef CONFIG_MEMORY_ISOLATION
4429 zone->nr_pageblock_isolate = 0;
4430#endif
4401 if (!size) 4431 if (!size)
4402 continue; 4432 continue;
4403 4433
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index fb482cf438da..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -8,6 +8,28 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include "internal.h" 9#include "internal.h"
10 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
11int set_migratetype_isolate(struct page *page) 33int set_migratetype_isolate(struct page *page)
12{ 34{
13 struct zone *zone; 35 struct zone *zone;
@@ -54,7 +76,7 @@ int set_migratetype_isolate(struct page *page)
54 76
55out: 77out:
56 if (!ret) { 78 if (!ret) {
57 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 79 set_pageblock_isolate(page);
58 move_freepages_block(zone, page, MIGRATE_ISOLATE); 80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
59 } 81 }
60 82
@@ -72,8 +94,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
72 spin_lock_irqsave(&zone->lock, flags); 94 spin_lock_irqsave(&zone->lock, flags);
73 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
74 goto out; 96 goto out;
75 set_pageblock_migratetype(page, migratetype);
76 move_freepages_block(zone, page, migratetype); 97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
77out: 99out:
78 spin_unlock_irqrestore(&zone->lock, flags); 100 spin_unlock_irqrestore(&zone->lock, flags);
79} 101}