author     Lisa Du <cldu@marvell.com>                        2013-09-11 17:22:36 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2013-09-11 18:58:01 -0400
commit     6e543d5780e36ff5ee56c44d7e2e30db3457a7ed (patch)
tree       094208c4caad9d0d766137c243d0cfe97a1ce0b9 /mm
parent     7a8010cd36273ff5f6fea5201ef9232f30cebbd9 (diff)
mm: vmscan: fix do_try_to_free_pages() livelock
This patch is based on KOSAKI's work; I have added a little more description. Please refer to https://lkml.org/lkml/2012/6/14/74.

I found the system can enter a state where a zone has lots of free pages but only order-0 and order-1 pages, meaning the zone is heavily fragmented. A high-order allocation can then cause a long stall in the direct reclaim path (e.g. 60 seconds), especially in a no-swap, no-compaction environment. The problem was observed on v3.4, but the issue still lives in the current tree. The reason is that do_try_to_free_pages() enters a livelock:

kswapd will go to sleep if the zones have been fully scanned and are still not balanced, since kswapd thinks there is little point in trying all over again and wants to avoid an infinite loop. Instead it changes the order from high-order to order-0, because kswapd thinks order-0 is the most important. See commit 73ce02e9 for the details.

If the watermarks are OK, kswapd goes back to sleep and may leave zone->all_unreclaimable = 0. It assumes high-order users can still perform direct reclaim if they wish.

Direct reclaim continues to reclaim for a high order which is not a COSTLY_ORDER, without invoking the OOM killer, until kswapd turns on zone->all_unreclaimable. This is done to avoid a too-early oom-kill. So direct reclaim depends on kswapd to break this loop.

In the worst case, direct reclaim may continue page reclaim forever while kswapd sleeps forever, until something like a watchdog detects it and finally kills the process, as described in http://thread.gmane.org/gmane.linux.kernel.mm/103737.

We can't turn on zone->all_unreclaimable from the direct reclaim path, because direct reclaim doesn't take any lock and doing so would be racy. Thus this patch removes the zone->all_unreclaimable field completely and recalculates the zone's reclaimable state every time.

Note: we can't take the approach of having direct reclaim look at zone->pages_scanned directly while kswapd continues to use zone->all_unreclaimable, because that is racy as well. Commit 929bea7c71 ("vmscan: all_unreclaimable() use zone->all_unreclaimable as a name") describes the details.

[akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()]
Cc: Aaditya Kumar <aaditya.kumar.30@gmail.com>
Cc: Ying Han <yinghan@google.com>
Cc: Nick Piggin <npiggin@gmail.com>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Bob Liu <lliubbo@gmail.com>
Cc: Neil Zhang <zhangwm@marvell.com>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Lisa Du <cldu@marvell.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
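For illustration, below is a minimal standalone sketch (plain userspace C, not kernel code) of the reclaimability heuristic the patch switches to; it mirrors the zone_reclaimable() helper added to mm/vmscan.c in the diff below. The struct zone_sketch type and the sample numbers are assumptions made up for the example, not the real struct zone.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the two quantities zone_reclaimable() compares. */
struct zone_sketch {
	unsigned long pages_scanned;     /* pages scanned since the last page was freed */
	unsigned long reclaimable_pages; /* file LRU pages, plus anon LRU pages when swap exists */
};

/*
 * Same 6x cut-off as the patch's zone_reclaimable(): once six times the
 * reclaimable pages have been scanned without freeing anything, the zone
 * is treated as unreclaimable.
 */
static bool zone_reclaimable_sketch(const struct zone_sketch *z)
{
	return z->pages_scanned < z->reclaimable_pages * 6;
}

int main(void)
{
	struct zone_sketch z = { .pages_scanned = 70000, .reclaimable_pages = 10000 };

	/* 70000 >= 6 * 10000, so reclaim gives up on this zone instead of looping. */
	printf("zone reclaimable: %s\n", zone_reclaimable_sketch(&z) ? "yes" : "no");
	return 0;
}

Because direct reclaim recomputes this predicate on every pass instead of waiting for kswapd to set a shared zone->all_unreclaimable flag, the dependency that caused the livelock is gone.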
Diffstat (limited to 'mm')
-rw-r--r--  mm/internal.h       |  2
-rw-r--r--  mm/migrate.c        |  2
-rw-r--r--  mm/page-writeback.c |  3
-rw-r--r--  mm/page_alloc.c     |  5
-rw-r--r--  mm/vmscan.c         | 66
-rw-r--r--  mm/vmstat.c         |  5
6 files changed, 43 insertions(+), 40 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 4390ac6c106e..684f7aa9692a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn;
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
+extern bool zone_reclaimable(struct zone *zone);
 
 /*
  * in mm/rmap.c:
diff --git a/mm/migrate.c b/mm/migrate.c
index 61f14a1923fd..b7ded7eafe3a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1471,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 		if (!populated_zone(zone))
 			continue;
 
-		if (zone->all_unreclaimable)
+		if (!zone_reclaimable(zone))
 			continue;
 
 		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d374b29296dd..3750431b3cd8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,8 +36,11 @@
 #include <linux/pagevec.h>
 #include <linux/timer.h>
 #include <linux/sched/rt.h>
+#include <linux/mm_inline.h>
 #include <trace/events/writeback.h>
 
+#include "internal.h"
+
 /*
  * Sleep at most 200ms at a time in balance_dirty_pages().
  */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7b1b706a1ffa..ff2782576e39 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/memcontrol.h>
 #include <linux/prefetch.h>
+#include <linux/mm_inline.h>
 #include <linux/migrate.h>
 #include <linux/page-debug-flags.h>
 #include <linux/hugetlb.h>
@@ -647,7 +648,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int to_free = count;
 
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	while (to_free) {
@@ -696,7 +696,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 		int migratetype)
 {
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	__free_one_page(page, zone, order, migratetype);
@@ -3164,7 +3163,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
 			zone->pages_scanned,
-			(zone->all_unreclaimable ? "yes" : "no")
+			(!zone_reclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 44c072a7cba2..fe715daeb8bc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc)
 }
 #endif
 
+unsigned long zone_reclaimable_pages(struct zone *zone)
+{
+	int nr;
+
+	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
+	     zone_page_state(zone, NR_INACTIVE_FILE);
+
+	if (get_nr_swap_pages() > 0)
+		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
+		      zone_page_state(zone, NR_INACTIVE_ANON);
+
+	return nr;
+}
+
+bool zone_reclaimable(struct zone *zone)
+{
+	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
 static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
 	if (!mem_cgroup_disabled())
@@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * latencies, so it's better to scan a minimum amount there as
 	 * well.
 	 */
-	if (current_is_kswapd() && zone->all_unreclaimable)
+	if (current_is_kswapd() && !zone_reclaimable(zone))
 		force_scan = true;
 	if (!global_reclaim(sc))
 		force_scan = true;
@@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		if (global_reclaim(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			if (zone->all_unreclaimable &&
-					sc->priority != DEF_PRIORITY)
+			if (sc->priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;	/* Let kswapd poll it */
 			if (IS_ENABLED(CONFIG_COMPACTION)) {
 				/*
@@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	return aborted_reclaim;
 }
 
-static bool zone_reclaimable(struct zone *zone)
-{
-	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
-}
-
 /* All zones in zonelist are unreclaimable? */
 static bool all_unreclaimable(struct zonelist *zonelist,
 		struct scan_control *sc)
@@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
 			continue;
 		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
-		if (!zone->all_unreclaimable)
+		if (zone_reclaimable(zone))
 			return false;
 	}
 
@@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		 * DEF_PRIORITY. Effectively, it considers them balanced so
 		 * they must be considered balanced here as well!
 		 */
-		if (zone->all_unreclaimable) {
+		if (!zone_reclaimable(zone)) {
 			balanced_pages += zone->managed_pages;
 			continue;
 		}
@@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
 			       unsigned long lru_pages,
 			       unsigned long *nr_attempted)
 {
-	unsigned long nr_slab;
 	int testorder = sc->order;
 	unsigned long balance_gap;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	shrink_zone(zone, sc);
 
 	reclaim_state->reclaimed_slab = 0;
-	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+	shrink_slab(&shrink, sc->nr_scanned, lru_pages);
 	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
 
 	/* Account for the number of pages attempted to reclaim */
 	*nr_attempted += sc->nr_to_reclaim;
 
-	if (nr_slab == 0 && !zone_reclaimable(zone))
-		zone->all_unreclaimable = 1;
-
 	zone_clear_flag(zone, ZONE_WRITEBACK);
 
 	/*
@@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * BDIs but as pressure is relieved, speculatively avoid congestion
 	 * waits.
 	 */
-	if (!zone->all_unreclaimable &&
+	if (zone_reclaimable(zone) &&
 	    zone_balanced(zone, testorder, 0, classzone_idx)) {
 		zone_clear_flag(zone, ZONE_CONGESTED);
 		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			/*
@@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			sc.nr_scanned = 0;
@@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void)
 	return nr;
 }
 
-unsigned long zone_reclaimable_pages(struct zone *zone)
-{
-	int nr;
-
-	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
-	     zone_page_state(zone, NR_INACTIVE_FILE);
-
-	if (get_nr_swap_pages() > 0)
-		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
-		      zone_page_state(zone, NR_INACTIVE_ANON);
-
-	return nr;
-}
-
 #ifdef CONFIG_HIBERNATION
 /*
  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
 		return ZONE_RECLAIM_FULL;
 
-	if (zone->all_unreclaimable)
+	if (!zone_reclaimable(zone))
 		return ZONE_RECLAIM_FULL;
 
 	/*
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d57a09143bf9..9bb314577911 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -19,6 +19,9 @@
 #include <linux/math64.h>
 #include <linux/writeback.h>
 #include <linux/compaction.h>
+#include <linux/mm_inline.h>
+
+#include "internal.h"
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -1088,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n  all_unreclaimable: %u"
 		   "\n  start_pfn:         %lu"
 		   "\n  inactive_ratio:    %u",
-		   zone->all_unreclaimable,
+		   !zone_reclaimable(zone),
 		   zone->zone_start_pfn,
 		   zone->inactive_ratio);
 	seq_putc(m, '\n');