author    Mel Gorman <mgorman@suse.de>    2011-07-08 18:39:40 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2011-07-09 00:14:43 -0400
commit    215ddd6664ced067afca7eebd2d1eb83f064ff5a (patch)
tree      b0e01235355d9c77b3bf63e0a57a6721fc8e3793
parent    da175d06b437093f93109ba9e5efbe44dfdf9409 (diff)
mm: vmscan: only read new_classzone_idx from pgdat when reclaiming successfully
During allocator-intensive workloads, kswapd will be woken frequently, causing free memory to oscillate between the high and min watermarks. This is expected behaviour. Unfortunately, if the highest zone is small, a problem occurs.

When balance_pgdat() returns, it may be at a lower classzone_idx than it started with because the highest zone was unreclaimable. Before checking whether it should go to sleep, though, it checks pgdat->classzone_idx, which, when there is no other activity, will be MAX_NR_ZONES-1. It interprets this as having been woken up while reclaiming, skips scheduling and reclaims again. As there is no useful reclaim work to do, it enters a loop of shrinking slab, consuming loads of CPU until the highest zone becomes reclaimable for a long period of time.

There are two problems here. 1) If the returned classzone or order is lower, kswapd will continue reclaiming without scheduling. 2) If the highest zone was marked unreclaimable but balance_pgdat() returns immediately at DEF_PRIORITY, the new lower classzone is not communicated back to kswapd() for sleeping.

This patch does two related things. First, if the end_zone is unreclaimable, this information is communicated back. Second, if the classzone or order was reduced due to failing to reclaim, new information is not read from pgdat; instead, an attempt is made to go to sleep. Because of this, it is also necessary that pgdat->classzone_idx be initialised each time to pgdat->nr_zones - 1 to avoid re-reads being interpreted as wakeups.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reported-by: Pádraig Brady <P@draigBrady.com>
Tested-by: Pádraig Brady <P@draigBrady.com>
Tested-by: Andrew Lutomirski <luto@mit.edu>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
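For readers tracing the control flow outside the kernel tree, what follows is a minimal userspace sketch of the sleep-gating half of the fix. The struct node type, the balance() stub and the fixed three-pass loop are hypothetical simplifications for illustration only; just the two if-conditions are taken from the patched kswapd() shown in the diff below.

/*
 * Hypothetical userspace model (not kernel code) of the sleep gating this
 * patch adds to kswapd()'s main loop.  Only the two if-conditions mirror
 * the patched kernel source.
 */
#include <stdio.h>

struct node {
	int nr_zones;			/* populated zones on this node */
	int classzone_idx;		/* zone index requested by last waker */
	unsigned long kswapd_max_order;	/* order requested by last waker */
};

/*
 * Stand-in for balance_pgdat(): pretend the highest zone is unreclaimable,
 * so each pass comes back with a lower effective classzone.
 */
static int balance(int classzone_idx)
{
	return classzone_idx > 0 ? classzone_idx - 1 : 0;
}

int main(void)
{
	struct node pgdat = { .nr_zones = 3, .classzone_idx = 2 };
	unsigned long order = 0, new_order = 0;
	int classzone_idx, new_classzone_idx;

	classzone_idx = new_classzone_idx = pgdat.nr_zones - 1;

	for (int pass = 0; pass < 3; pass++) {
		/*
		 * Patched behaviour: only re-read the wakeup request from
		 * pgdat if the previous pass met its full target.  A reduced
		 * classzone_idx or order means reclaim failed, so the stale
		 * pgdat.classzone_idx (nr_zones - 1) is not mistaken for a
		 * fresh wakeup.
		 */
		if (classzone_idx >= new_classzone_idx && order == new_order) {
			new_order = pgdat.kswapd_max_order;
			new_classzone_idx = pgdat.classzone_idx;
			pgdat.kswapd_max_order = 0;
			pgdat.classzone_idx = pgdat.nr_zones - 1;
		}

		if (order < new_order || classzone_idx > new_classzone_idx) {
			/* someone asked for more: reclaim without sleeping */
			order = new_order;
			classzone_idx = new_classzone_idx;
			printf("pass %d: harder request, keep reclaiming\n", pass);
		} else {
			printf("pass %d: no harder request, try to sleep\n", pass);
		}

		/* pretend we were woken and ran balance_pgdat() again */
		classzone_idx = balance(classzone_idx);
	}
	return 0;
}

Running it, only the first pass re-reads the request from pgdat; once balance() hands back a reduced classzone, the stale pgdat.classzone_idx is left alone and every pass reaches the sleep path instead of reclaiming again.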
 mm/vmscan.c | 34 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a51b3c9f05ba..5ed24b94c5e6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2451,7 +2451,6 @@ loop_again:
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
-				*classzone_idx = i;
 				break;
 			}
 		}
@@ -2531,8 +2530,11 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
-			if (zone->all_unreclaimable)
+			if (zone->all_unreclaimable) {
+				if (end_zone && end_zone == i)
+					end_zone--;
 				continue;
+			}
 
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
@@ -2712,8 +2714,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  */
 static int kswapd(void *p)
 {
-	unsigned long order;
-	int classzone_idx;
+	unsigned long order, new_order;
+	int classzone_idx, new_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
 
@@ -2743,17 +2745,23 @@ static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
-	order = 0;
-	classzone_idx = MAX_NR_ZONES - 1;
+	order = new_order = 0;
+	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	for ( ; ; ) {
-		unsigned long new_order;
-		int new_classzone_idx;
 		int ret;
 
-		new_order = pgdat->kswapd_max_order;
-		new_classzone_idx = pgdat->classzone_idx;
-		pgdat->kswapd_max_order = 0;
-		pgdat->classzone_idx = MAX_NR_ZONES - 1;
+		/*
+		 * If the last balance_pgdat was unsuccessful it's unlikely a
+		 * new request of a similar or harder type will succeed soon
+		 * so consider going to sleep on the basis we reclaimed at
+		 */
+		if (classzone_idx >= new_classzone_idx && order == new_order) {
+			new_order = pgdat->kswapd_max_order;
+			new_classzone_idx = pgdat->classzone_idx;
+			pgdat->kswapd_max_order = 0;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
+		}
+
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
 			 * Don't sleep if someone wants a larger 'order'
@@ -2766,7 +2774,7 @@ static int kswapd(void *p)
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = MAX_NR_ZONES - 1;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
 		}
 
 		ret = try_to_freeze();
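The other half of the fix, lowering end_zone when the highest zone is unreclaimable, can be modelled the same way. This too is a hypothetical simplification: zones become an array of flags and the reclaim work is elided; only the decrement taken by the patched balance_pgdat() is kept.

/*
 * Hypothetical userspace model of the end_zone adjustment added to
 * balance_pgdat(): if the zone currently capping the scan is marked
 * all_unreclaimable, lower end_zone so a reduced, achievable target is
 * what gets reported back rather than a zone that cannot be balanced.
 */
#include <stdio.h>
#include <stdbool.h>

#define NR_ZONES 3

int main(void)
{
	/* the small highest zone (index 2) is unreclaimable */
	bool all_unreclaimable[NR_ZONES] = { false, false, true };
	int end_zone = NR_ZONES - 1;

	for (int i = 0; i <= end_zone; i++) {
		if (all_unreclaimable[i]) {
			/* mirrors: if (end_zone && end_zone == i) end_zone--; */
			if (end_zone && end_zone == i)
				end_zone--;
			continue;
		}
		/* ... reclaim zone i towards its high watermark ... */
	}

	printf("effective end_zone: %d\n", end_zone);	/* prints 1 */
	return 0;
}

It is this reduced end_zone, communicated back to kswapd(), that lets it go to sleep rather than retry a zone it cannot balance.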