Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--    mm/vmscan.c    185
1 file changed, 169 insertions, 16 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 661576324c7f..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
         return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
         cond_resched();
 
+        mem_cgroup_uncharge_start();
         while (!list_empty(page_list)) {
                 enum page_references references;
                 struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                 if (PageWriteback(page)) {
-                        nr_writeback++;
-                        unlock_page(page);
-                        goto keep;
+                        /*
+                         * memcg doesn't have any dirty pages throttling so we
+                         * could easily OOM just because too many pages are in
+                         * writeback and there is nothing else to reclaim.
+                         *
+                         * Check __GFP_IO, certainly because a loop driver
+                         * thread might enter reclaim, and deadlock if it waits
+                         * on a page for which it is needed to do the write
+                         * (loop masks off __GFP_IO|__GFP_FS for this reason);
+                         * but more thought would probably show more reasons.
+                         *
+                         * Don't require __GFP_FS, since we're not going into
+                         * the FS, just waiting on its writeback completion.
+                         * Worryingly, ext4, gfs2 and xfs allocate pages with
+                         * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+                         * testing may_enter_fs here is liable to OOM on them.
+                         */
+                        if (global_reclaim(sc) ||
+                            !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+                                /*
+                                 * This is slightly racy - end_page_writeback()
+                                 * might have just cleared PageReclaim, then
+                                 * setting PageReclaim here ends up interpreted
+                                 * as PageReadahead - but that does not matter
+                                 * enough to care. What we do want is for this
+                                 * page to have PageReclaim set next time memcg
+                                 * reclaim reaches the tests above, so it will
+                                 * then wait_on_page_writeback() to avoid OOM;
+                                 * and it's also appropriate in global reclaim.
+                                 */
+                                SetPageReclaim(page);
+                                nr_writeback++;
+                                goto keep_locked;
+                        }
+                        wait_on_page_writeback(page);
                 }
 
                 references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
 
         list_splice(&ret_pages, page_list);
         count_vm_events(PGACTIVATE, pgactivate);
+        mem_cgroup_uncharge_end();
         *ret_nr_dirty += nr_dirty;
         *ret_nr_writeback += nr_writeback;
         return nr_reclaimed;
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
  * by looking at the fraction of the pages scanned we did rotate back
  * onto the active list instead of evict.
  *
- * nr[0] = anon pages to scan; nr[1] = file pages to scan
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                            unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
         return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+        struct zone *zone;
+        unsigned long pfmemalloc_reserve = 0;
+        unsigned long free_pages = 0;
+        int i;
+        bool wmark_ok;
+
+        for (i = 0; i <= ZONE_NORMAL; i++) {
+                zone = &pgdat->node_zones[i];
+                pfmemalloc_reserve += min_wmark_pages(zone);
+                free_pages += zone_page_state(zone, NR_FREE_PAGES);
+        }
+
+        wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+        /* kswapd must be awake if processes are being throttled */
+        if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+                pgdat->classzone_idx = min(pgdat->classzone_idx,
+                                                (enum zone_type)ZONE_NORMAL);
+                wake_up_interruptible(&pgdat->kswapd_wait);
+        }
+
+        return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached.
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+                                        nodemask_t *nodemask)
+{
+        struct zone *zone;
+        int high_zoneidx = gfp_zone(gfp_mask);
+        pg_data_t *pgdat;
+
+        /*
+         * Kernel threads should not be throttled as they may be indirectly
+         * responsible for cleaning pages necessary for reclaim to make forward
+         * progress. kjournald for example may enter direct reclaim while
+         * committing a transaction where throttling it could force other
+         * processes to block on log_wait_commit().
+         */
+        if (current->flags & PF_KTHREAD)
+                return;
+
+        /* Check if the pfmemalloc reserves are ok */
+        first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+        pgdat = zone->zone_pgdat;
+        if (pfmemalloc_watermark_ok(pgdat))
+                return;
+
+        /* Account for the throttling */
+        count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+        /*
+         * If the caller cannot enter the filesystem, it's possible that it
+         * is due to the caller holding an FS lock or performing a journal
+         * transaction in the case of a filesystem like ext[3|4]. In this case,
+         * it is not safe to block on pfmemalloc_wait as kswapd could be
+         * blocked waiting on the same lock. Instead, throttle for up to a
+         * second before continuing.
+         */
+        if (!(gfp_mask & __GFP_FS)) {
+                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+                        pfmemalloc_watermark_ok(pgdat), HZ);
+                return;
+        }
+
+        /* Throttle until kswapd wakes the process */
+        wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+                pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                 gfp_t gfp_mask, nodemask_t *nodemask)
 {
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                 .gfp_mask = sc.gfp_mask,
         };
 
+        throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+        /*
+         * Do not enter reclaim if fatal signal is pending. 1 is returned so
+         * that the page allocator does not consider triggering OOM
+         */
+        if (fatal_signal_pending(current))
+                return 1;
+
         trace_mm_vmscan_direct_reclaim_begin(order,
                                 sc.may_writepage,
                                 gfp_mask);
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
         return nr_reclaimed;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
                                                 gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
         return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
                                         int classzone_idx)
 {
         int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
         /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
         if (remaining)
-                return true;
+                return false;
+
+        /*
+         * There is a potential race between when kswapd checks its watermarks
+         * and a process gets throttled. There is also a potential race if
+         * processes get throttled, kswapd wakes, a large process exits thereby
+         * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
+         * is going to sleep, no process should be sleeping on pfmemalloc_wait
+         * so wake them now if necessary. If necessary, processes will wake
+         * kswapd and get throttled again
+         */
+        if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+                wake_up(&pgdat->pfmemalloc_wait);
+                return false;
+        }
 
         /* Check the watermark levels */
         for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
          * must be balanced
          */
         if (order)
-                return !pgdat_balanced(pgdat, balanced, classzone_idx);
+                return pgdat_balanced(pgdat, balanced, classzone_idx);
         else
-                return !all_zones_ok;
+                return all_zones_ok;
 }
 
 /*
@@ -2537,7 +2677,7 @@ loop_again:
                                  * consider it to be no longer congested. It's
                                  * possible there are dirty pages backed by
                                  * congested BDIs but as pressure is relieved,
-                                 * spectulatively avoid congestion waits
+                                 * speculatively avoid congestion waits
                                  */
                                 zone_clear_flag(zone, ZONE_CONGESTED);
                                 if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
                         }
 
                 }
+
+                /*
+                 * If the low watermark is met there is no need for processes
+                 * to be throttled on pfmemalloc_wait as they should now be
+                 * able to safely make forward progress. Wake them
+                 */
+                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+                                pfmemalloc_watermark_ok(pgdat))
+                        wake_up(&pgdat->pfmemalloc_wait);
+
                 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
                         break;          /* kswapd: all done */
                 /*
@@ -2646,7 +2796,7 @@ out:
         }
 
         /*
-         * Return the order we were reclaiming at so sleeping_prematurely()
+         * Return the order we were reclaiming at so prepare_kswapd_sleep()
          * makes a decision on the order we were last reclaiming at. However,
          * if another caller entered the allocator slow path while kswapd
          * was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
         /* Try to sleep for a short interval */
-        if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+        if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                 remaining = schedule_timeout(HZ/10);
                 finish_wait(&pgdat->kswapd_wait, &wait);
                 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
          * After a short sleep, check if it was a premature sleep. If not, then
          * go fully to sleep until explicitly woken up.
          */
-        if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+        if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
                 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
                 /*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
                  * them before going back to sleep.
                  */
                 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-                schedule();
+
+                if (!kthread_should_stop())
+                        schedule();
+
                 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
         } else {
                 if (remaining)