-rw-r--r--  include/linux/mmzone.h |  2 ++
-rw-r--r--  mm/internal.h          |  6 ++++++
-rw-r--r--  mm/page_alloc.c        |  9 ++-------
-rw-r--r--  mm/vmscan.c            | 47 ++++++++++++++++++++++++++++++++---------------
-rw-r--r--  mm/vmstat.c            |  2 +-
5 files changed, 43 insertions(+), 23 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 8e02b3750fe0..d2c50ab6ae40 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -630,6 +630,8 @@ typedef struct pglist_data {
 	int kswapd_order;
 	enum zone_type kswapd_classzone_idx;
 
+	int kswapd_failures;	/* Number of 'reclaimed == 0' runs */
+
 #ifdef CONFIG_COMPACTION
 	int kcompactd_max_order;
 	enum zone_type kcompactd_classzone_idx;
diff --git a/mm/internal.h b/mm/internal.h
index 266efaeaa370..e5a0e0ec2177 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -81,6 +81,12 @@ static inline void set_page_refcounted(struct page *page)
 extern unsigned long highest_memmap_pfn;
 
 /*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is considered the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
  * in mm/vmscan.c:
  */
 extern int isolate_lru_page(struct page *page);
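
Taken together, the new pglist_data field and the relocated constant define the patch's core invariant: a node whose kswapd_failures counter reaches MAX_RECLAIM_RETRIES is treated as hopeless until some reclaim run makes progress again. Below is a minimal userspace sketch of that life cycle; struct node_model and kswapd_run() are hypothetical stand-ins for illustration, not kernel code.

    /* Hypothetical userspace model of the kswapd_failures life cycle. */
    #include <stdio.h>

    #define MAX_RECLAIM_RETRIES 16

    struct node_model {
            int kswapd_failures;
    };

    /* One simulated kswapd run; 'reclaimed' is pages freed this run. */
    static void kswapd_run(struct node_model *node, unsigned long reclaimed)
    {
            if (reclaimed)
                    node->kswapd_failures = 0;      /* progress revives the node */
            else
                    node->kswapd_failures++;        /* another fruitless run */
    }

    int main(void)
    {
            struct node_model node = { 0 };
            int run;

            for (run = 0; run < 20; run++) {
                    kswapd_run(&node, 0);           /* nothing reclaimable */
                    if (node.kswapd_failures >= MAX_RECLAIM_RETRIES) {
                            printf("run %d: node hopeless, kswapd sleeps\n", run);
                            break;
                    }
            }
            return 0;
    }
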
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd01501efab9..42c0543e46c3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3522,12 +3522,6 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 }
 
 /*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
  * The reclaim feedback represented by did_some_progress (any progress during
@@ -4534,7 +4528,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
 			K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
 			K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
 			node_page_state(pgdat, NR_PAGES_SCANNED),
-			!pgdat_reclaimable(pgdat) ? "yes" : "no");
+			pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+				"yes" : "no");
 	}
 
 	for_each_populated_zone(zone) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031ef994d..667644e53b5c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2620,6 +2620,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));
 
+	/*
+	 * Kswapd gives up on balancing particular nodes after too
+	 * many failures to reclaim anything from them and goes to
+	 * sleep. On reclaim progress, reset the failure counter. A
+	 * successful direct reclaim run will revive a dormant kswapd.
+	 */
+	if (reclaimable)
+		pgdat->kswapd_failures = 0;
+
 	return reclaimable;
 }
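
The reset at the end of shrink_node() is shared by kswapd and direct reclaim, which is what the comment means by reviving a dormant kswapd: once any reclaim run frees pages, the failure counter clears, and wakeup_kswapd() (see the hunk further down) stops skipping the node. A hedged sketch of that interplay, again with hypothetical userspace stand-ins:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_RECLAIM_RETRIES 16

    struct node_model {
            int kswapd_failures;
    };

    /* Mirrors the new gate in wakeup_kswapd(): hopeless nodes are skipped. */
    static bool kswapd_wakeable(const struct node_model *node)
    {
            return node->kswapd_failures < MAX_RECLAIM_RETRIES;
    }

    /* Mirrors the reset at the end of shrink_node(). */
    static void reclaim_run(struct node_model *node, unsigned long reclaimed)
    {
            if (reclaimed)
                    node->kswapd_failures = 0;
    }

    int main(void)
    {
            struct node_model node = { .kswapd_failures = MAX_RECLAIM_RETRIES };

            printf("wakeable before: %d\n", kswapd_wakeable(&node));  /* 0 */
            reclaim_run(&node, 32); /* direct reclaim frees 32 pages */
            printf("wakeable after:  %d\n", kswapd_wakeable(&node));  /* 1 */
            return 0;
    }
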
@@ -2694,10 +2703,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 						 GFP_KERNEL | __GFP_HARDWALL))
 				continue;
 
-			if (sc->priority != DEF_PRIORITY &&
-			    !pgdat_reclaimable(zone->zone_pgdat))
-				continue;	/* Let kswapd poll it */
-
 			/*
 			 * If we already have plenty of memory free for
 			 * compaction in this zone, don't free any more.
@@ -2817,7 +2822,7 @@ retry:
 	return 0;
 }
 
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
 {
 	struct zone *zone;
 	unsigned long pfmemalloc_reserve = 0;
@@ -2825,6 +2830,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 	int i;
 	bool wmark_ok;
 
+	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+		return true;
+
 	for (i = 0; i <= ZONE_NORMAL; i++) {
 		zone = &pgdat->node_zones[i];
 		if (!managed_zone(zone) ||
@@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 
 		/* Throttle based on the first usable node */
 		pgdat = zone->zone_pgdat;
-		if (pfmemalloc_watermark_ok(pgdat))
+		if (allow_direct_reclaim(pgdat))
 			goto out;
 		break;
 	}
@@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 	 */
 	if (!(gfp_mask & __GFP_FS)) {
 		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
-			pfmemalloc_watermark_ok(pgdat), HZ);
+			allow_direct_reclaim(pgdat), HZ);
 
 		goto check_pending;
 	}
 
 	/* Throttle until kswapd wakes the process */
 	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-		pfmemalloc_watermark_ok(pgdat));
+		allow_direct_reclaim(pgdat));
 
 check_pending:
 	if (fatal_signal_pending(current))
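
The new early return in allow_direct_reclaim() also changes throttle_direct_reclaim(): tasks parked on pfmemalloc_wait can no longer sleep indefinitely against a node kswapd has given up on, because the wait predicate becomes true regardless of the watermark state. A compressed sketch of the predicate's new shape; pfmemalloc_wmark_ok() is a hypothetical stand-in for the ZONE_NORMAL watermark walk:

    #include <stdbool.h>
    #include <stdio.h>

    #define MAX_RECLAIM_RETRIES 16

    /* Hypothetical stand-in for the ZONE_NORMAL watermark walk. */
    static bool pfmemalloc_wmark_ok(void)
    {
            return false;   /* pretend reserves are exhausted */
    }

    /* Sketch of allow_direct_reclaim()'s new shape. */
    static bool allow_direct_reclaim_model(int kswapd_failures)
    {
            /* Hopeless node: do not throttle direct reclaimers on kswapd. */
            if (kswapd_failures >= MAX_RECLAIM_RETRIES)
                    return true;
            return pfmemalloc_wmark_ok();
    }

    int main(void)
    {
            /* Watermarks not ok, but kswapd gave up: proceed anyway. */
            printf("%d\n", allow_direct_reclaim_model(MAX_RECLAIM_RETRIES));
            return 0;
    }
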
@@ -3114,7 +3122,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 
 	/*
 	 * The throttled processes are normally woken up in balance_pgdat() as
-	 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+	 * soon as allow_direct_reclaim() is true. But there is a potential
 	 * race between when kswapd checks the watermarks and a process gets
 	 * throttled. There is also a potential race if processes get
 	 * throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3128,6 +3136,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	if (waitqueue_active(&pgdat->pfmemalloc_wait))
 		wake_up_all(&pgdat->pfmemalloc_wait);
 
+	/* Hopeless node, leave it to direct reclaim */
+	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+		return true;
+
 	for (i = 0; i <= classzone_idx; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
@@ -3214,9 +3226,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 	count_vm_event(PAGEOUTRUN);
 
 	do {
+		unsigned long nr_reclaimed = sc.nr_reclaimed;
 		bool raise_priority = true;
 
-		sc.nr_reclaimed = 0;
 		sc.reclaim_idx = classzone_idx;
 
 		/*
@@ -3295,7 +3307,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * able to safely make forward progress. Wake them
 		 */
 		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
-				pfmemalloc_watermark_ok(pgdat))
+				allow_direct_reclaim(pgdat))
 			wake_up_all(&pgdat->pfmemalloc_wait);
 
 		/* Check if kswapd should be suspending */
@@ -3306,10 +3318,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
 		 * Raise priority if scanning rate is too low or there was no
 		 * progress in reclaiming pages
 		 */
-		if (raise_priority || !sc.nr_reclaimed)
+		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+		if (raise_priority || !nr_reclaimed)
 			sc.priority--;
 	} while (sc.priority >= 1);
 
+	if (!sc.nr_reclaimed)
+		pgdat->kswapd_failures++;
+
 out:
 	/*
 	 * Return the order kswapd stopped reclaiming at as
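
The accounting change here is subtle: sc.nr_reclaimed used to be zeroed at the top of every priority iteration, so only the last iteration's count survived the loop; now it accumulates across the whole run while a local snapshot provides the per-priority delta, and only a run whose grand total is zero counts as a kswapd failure. A small userspace model of the new flow, with page counts made up for illustration:

    /* Model of balance_pgdat()'s new accounting: a cumulative total
     * plus per-iteration deltas; only a run that reclaims nothing
     * overall counts as a kswapd failure. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long sc_nr_reclaimed = 0;       /* cumulative over the run */
            unsigned long per_prio[] = { 0, 0, 5, 120 };
            int kswapd_failures = 0;
            int i;

            for (i = 0; i < 4; i++) {
                    unsigned long nr_reclaimed = sc_nr_reclaimed;   /* snapshot */

                    sc_nr_reclaimed += per_prio[i];                 /* shrink_node() */

                    nr_reclaimed = sc_nr_reclaimed - nr_reclaimed;  /* this prio only */
                    if (!nr_reclaimed)
                            printf("iteration %d: no progress, raise priority\n", i);
            }

            if (!sc_nr_reclaimed)   /* entire run reclaimed nothing */
                    kswapd_failures++;

            printf("run total %lu, failures %d\n", sc_nr_reclaimed, kswapd_failures);
            return 0;
    }
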
@@ -3509,6 +3525,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;
 
+	/* Hopeless node, leave it to direct reclaim */
+	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+		return;
+
 	/* Only wake kswapd if all zones are unbalanced */
 	for (z = 0; z <= classzone_idx; z++) {
 		zone = pgdat->node_zones + z;
@@ -3779,9 +3799,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
 	    sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
 		return NODE_RECLAIM_FULL;
 
-	if (!pgdat_reclaimable(pgdat))
-		return NODE_RECLAIM_FULL;
-
 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5a4f5c5a31e8..baee70dafba8 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1425,7 +1425,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   "\n  node_unreclaimable:  %u"
 		   "\n  start_pfn:           %lu"
 		   "\n  node_inactive_ratio: %u",
-		   !pgdat_reclaimable(zone->zone_pgdat),
+		   pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
 		   zone->zone_start_pfn,
 		   zone->zone_pgdat->inactive_ratio);
 	seq_putc(m, '\n');
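
After this change, the node_unreclaimable flag in /proc/zoneinfo reports "has kswapd given up on this node" rather than the old scanned-versus-reclaimable heuristic. A minimal hedged way to watch it from userspace, just filtering the lines that zoneinfo_show_print() emits:

    /* Minimal reader: print the node_unreclaimable lines from /proc/zoneinfo. */
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/zoneinfo", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f))
                    if (strstr(line, "node_unreclaimable"))
                            fputs(line, stdout);
            fclose(f);
            return 0;
    }
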