-rw-r--r--  include/linux/mmzone.h   2
-rw-r--r--  mm/internal.h            6
-rw-r--r--  mm/page_alloc.c          9
-rw-r--r--  mm/vmscan.c             47
-rw-r--r--  mm/vmstat.c              2
5 files changed, 43 insertions(+), 23 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8e02b3750fe0..d2c50ab6ae40 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -630,6 +630,8 @@ typedef struct pglist_data {
         int kswapd_order;
         enum zone_type kswapd_classzone_idx;
 
+        int kswapd_failures;            /* Number of 'reclaimed == 0' runs */
+
 #ifdef CONFIG_COMPACTION
         int kcompactd_max_order;
         enum zone_type kcompactd_classzone_idx;
diff --git a/mm/internal.h b/mm/internal.h
index 266efaeaa370..e5a0e0ec2177 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -81,6 +81,12 @@ static inline void set_page_refcounted(struct page *page)
 extern unsigned long highest_memmap_pfn;
 
 /*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is considered the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
  * in mm/vmscan.c:
  */
 extern int isolate_lru_page(struct page *page);
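
The threshold itself is unchanged at 16; hoisting it into mm/internal.h is what lets the vmscan.c hunks below test pgdat->kswapd_failures against it. As a rough illustration (a compilable userspace sketch, not kernel code; pgdat_sketch and node_is_hopeless are made-up stand-ins for struct pglist_data and the open-coded checks), the gate added throughout this patch reduces to:

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16

/* simplified stand-in for the kernel's struct pglist_data */
struct pgdat_sketch {
        int kswapd_failures;    /* number of 'reclaimed == 0' runs */
};

/* a node is treated as hopeless once the failure cap is hit */
static bool node_is_hopeless(const struct pgdat_sketch *pgdat)
{
        return pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES;
}

int main(void)
{
        struct pgdat_sketch pgdat = { .kswapd_failures = MAX_RECLAIM_RETRIES };

        /* wakeup_kswapd() and prepare_kswapd_sleep() both bail on this */
        printf("hopeless: %s\n", node_is_hopeless(&pgdat) ? "yes" : "no");
        return 0;
}
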
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd01501efab9..42c0543e46c3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3522,12 +3522,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 }
 
 /*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
  * The reclaim feedback represented by did_some_progress (any progress during
@@ -4534,7 +4528,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                         K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
                         K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
                         node_page_state(pgdat, NR_PAGES_SCANNED),
-                        !pgdat_reclaimable(pgdat) ? "yes" : "no");
+                        pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+                                "yes" : "no");
         }
 
         for_each_populated_zone(zone) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031ef994d..667644e53b5c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2620,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
         } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
 
+        /*
+         * Kswapd gives up on balancing particular nodes after too
+         * many failures to reclaim anything from them and goes to
+         * sleep. On reclaim progress, reset the failure counter. A
+         * successful direct reclaim run will revive a dormant kswapd.
+         */
+        if (reclaimable)
+                pgdat->kswapd_failures = 0;
+
         return reclaimable;
 }
 
@@ -2694,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                             GFP_KERNEL | __GFP_HARDWALL))
                                 continue;
 
-                        if (sc->priority != DEF_PRIORITY &&
-                            !pgdat_reclaimable(zone->zone_pgdat))
-                                continue;       /* Let kswapd poll it */
-
                         /*
                          * If we already have plenty of memory free for
                          * compaction in this zone, don't free any more.
@@ -2817,7 +2822,7 @@ retry:
         return 0;
 }
 
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
 {
         struct zone *zone;
         unsigned long pfmemalloc_reserve = 0;
@@ -2825,6 +2830,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
         int i;
         bool wmark_ok;
 
+        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+                return true;
+
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
                 if (!managed_zone(zone) ||
@@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
                 /* Throttle based on the first usable node */
                 pgdat = zone->zone_pgdat;
-                if (pfmemalloc_watermark_ok(pgdat))
+                if (allow_direct_reclaim(pgdat))
                         goto out;
                 break;
         }
@@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
          */
         if (!(gfp_mask & __GFP_FS)) {
                 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-                        pfmemalloc_watermark_ok(pgdat), HZ);
+                        allow_direct_reclaim(pgdat), HZ);
 
                 goto check_pending;
         }
 
         /* Throttle until kswapd wakes the process */
         wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-                pfmemalloc_watermark_ok(pgdat));
+                allow_direct_reclaim(pgdat));
 
 check_pending:
         if (fatal_signal_pending(current))
@@ -3114,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
-         * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+         * soon as allow_direct_reclaim() is true. But there is a potential
          * race between when kswapd checks the watermarks and a process gets
          * throttled. There is also a potential race if processes get
          * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3128,6 +3136,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         if (waitqueue_active(&pgdat->pfmemalloc_wait))
                 wake_up_all(&pgdat->pfmemalloc_wait);
 
+        /* Hopeless node, leave it to direct reclaim */
+        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+                return true;
+
         for (i = 0; i <= classzone_idx; i++) {
                 struct zone *zone = pgdat->node_zones + i;
 
@@ -3214,9 +3226,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
         count_vm_event(PAGEOUTRUN);
 
         do {
+                unsigned long nr_reclaimed = sc.nr_reclaimed;
                 bool raise_priority = true;
 
-                sc.nr_reclaimed = 0;
                 sc.reclaim_idx = classzone_idx;
 
                 /*
@@ -3295,7 +3307,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * able to safely make forward progress. Wake them
                  */
                 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-                                pfmemalloc_watermark_ok(pgdat))
+                                allow_direct_reclaim(pgdat))
                         wake_up_all(&pgdat->pfmemalloc_wait);
 
                 /* Check if kswapd should be suspending */
@@ -3306,10 +3318,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                  * Raise priority if scanning rate is too low or there was no
                  * progress in reclaiming pages
                  */
-                if (raise_priority || !sc.nr_reclaimed)
+                nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+                if (raise_priority || !nr_reclaimed)
                         sc.priority--;
         } while (sc.priority >= 1);
 
+        if (!sc.nr_reclaimed)
+                pgdat->kswapd_failures++;
+
 out:
         /*
          * Return the order kswapd stopped reclaiming at as
@@ -3509,6 +3525,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
 
+        /* Hopeless node, leave it to direct reclaim */
+        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+                return;
+
         /* Only wake kswapd if all zones are unbalanced */
         for (z = 0; z <= classzone_idx; z++) {
                 zone = pgdat->node_zones + z;
@@ -3779,9 +3799,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
             sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
                 return NODE_RECLAIM_FULL;
 
-        if (!pgdat_reclaimable(pgdat))
-                return NODE_RECLAIM_FULL;
-
         /*
          * Do not scan if the allocation should not be delayed.
          */
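
Taken together, the vmscan.c hunks implement one counter lifecycle: shrink_node() resets kswapd_failures on any progress, balance_pgdat() increments it only when an entire priority sweep reclaims nothing, and the wakeup/sleep paths stop involving kswapd once the counter reaches MAX_RECLAIM_RETRIES. A compilable userspace sketch of that lifecycle; pgdat_sketch, shrink_once and balance_once are simplified illustrative stand-ins, not kernel functions:

#include <stdio.h>

#define MAX_RECLAIM_RETRIES 16
#define DEF_PRIORITY 12

struct pgdat_sketch {
        int kswapd_failures;
};

/* stand-in for one shrink_node() pass; any progress resets the counter */
static unsigned long shrink_once(struct pgdat_sketch *pgdat,
                                 unsigned long reclaimable)
{
        if (reclaimable)
                pgdat->kswapd_failures = 0;
        return reclaimable;
}

/* stand-in for one balance_pgdat() run across all priorities */
static void balance_once(struct pgdat_sketch *pgdat, unsigned long reclaimable)
{
        unsigned long nr_reclaimed = 0;
        int priority;

        for (priority = DEF_PRIORITY; priority >= 1; priority--)
                nr_reclaimed += shrink_once(pgdat, reclaimable);

        if (!nr_reclaimed)      /* the entire run made no progress */
                pgdat->kswapd_failures++;
}

int main(void)
{
        struct pgdat_sketch pgdat = { .kswapd_failures = 0 };
        int runs = 0;

        /* fruitless runs accumulate until kswapd is left dormant */
        while (pgdat.kswapd_failures < MAX_RECLAIM_RETRIES) {
                balance_once(&pgdat, 0);
                runs++;
        }
        printf("kswapd parked after %d empty runs\n", runs);

        balance_once(&pgdat, 1);        /* any progress revives it */
        printf("failures after progress: %d\n", pgdat.kswapd_failures);
        return 0;
}

Instead of an up-front pgdat_reclaimable() estimate, the node is declared dead only after sixteen full reclaim runs have come up empty, and a single reclaimed page is enough to put kswapd back to work.
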
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5a4f5c5a31e8..baee70dafba8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1425,7 +1425,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1425 "\n node_unreclaimable: %u" 1425 "\n node_unreclaimable: %u"
1426 "\n start_pfn: %lu" 1426 "\n start_pfn: %lu"
1427 "\n node_inactive_ratio: %u", 1427 "\n node_inactive_ratio: %u",
1428 !pgdat_reclaimable(zone->zone_pgdat), 1428 pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
1429 zone->zone_start_pfn, 1429 zone->zone_start_pfn,
1430 zone->zone_pgdat->inactive_ratio); 1430 zone->zone_pgdat->inactive_ratio);
1431 seq_putc(m, '\n'); 1431 seq_putc(m, '\n');
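
After this change, the node_unreclaimable value in /proc/zoneinfo reflects pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES rather than the old pgdat_reclaimable() heuristic. One way to watch it from userspace, as a minimal sketch with only basic error handling:

#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/zoneinfo", "r");
        char line[256];

        if (!f) {
                perror("/proc/zoneinfo");
                return 1;
        }
        /* print each node's unreclaimable flag as exported above */
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "node_unreclaimable"))
                        fputs(line, stdout);
        fclose(f);
        return 0;
}
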