author    Thomas Gleixner <tglx@linutronix.de>  2012-10-09 15:20:05 -0400
committer Thomas Gleixner <tglx@linutronix.de>  2012-10-09 15:20:05 -0400
commit    db8c246937713e60b7628661ccc187eeb81f2bae (patch)
tree      6351e8bca23eef40fce85396d1c6f6cfffbd4b66 /mm/vmscan.c
parent    c5f66e99b7cb091e3d51ae8e8156892e8feb7fa3 (diff)
parent    28f2b02bc581ffc835bc1691b18d03f62fcf0395 (diff)
Merge branch 'fortglx/3.7/time' of git://git.linaro.org/people/jstultz/linux into timers/core
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 185
1 file changed, 169 insertions(+), 16 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 661576324c7..8d01243d956 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	cond_resched();
 
+	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
 		if (PageWriteback(page)) {
-			nr_writeback++;
-			unlock_page(page);
-			goto keep;
+			/*
+			 * memcg doesn't have any dirty pages throttling so we
+			 * could easily OOM just because too many pages are in
+			 * writeback and there is nothing else to reclaim.
+			 *
+			 * Check __GFP_IO, certainly because a loop driver
+			 * thread might enter reclaim, and deadlock if it waits
+			 * on a page for which it is needed to do the write
+			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
+			 * but more thought would probably show more reasons.
+			 *
+			 * Don't require __GFP_FS, since we're not going into
+			 * the FS, just waiting on its writeback completion.
+			 * Worryingly, ext4 gfs2 and xfs allocate pages with
+			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+			 * testing may_enter_fs here is liable to OOM on them.
+			 */
+			if (global_reclaim(sc) ||
+			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+				/*
+				 * This is slightly racy - end_page_writeback()
+				 * might have just cleared PageReclaim, then
+				 * setting PageReclaim here end up interpreted
+				 * as PageReadahead - but that does not matter
+				 * enough to care.  What we do want is for this
+				 * page to have PageReclaim set next time memcg
+				 * reclaim reaches the tests above, so it will
+				 * then wait_on_page_writeback() to avoid OOM;
+				 * and it's also appropriate in global reclaim.
+				 */
+				SetPageReclaim(page);
+				nr_writeback++;
+				goto keep_locked;
+			}
+			wait_on_page_writeback(page);
 		}
 
 		references = page_check_references(page, sc);
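Note: the rewritten PageWriteback branch above replaces the old unconditional skip with a two-way decision. Global reclaim, as well as memcg reclaim that lacks __GFP_IO or has not yet tagged the page with PageReclaim, marks the page and keeps it locked; only memcg reclaim that re-encounters an already-tagged page stalls on its writeback. A minimal userspace sketch of that decision, using hypothetical boolean stand-ins for the kernel state, might look like:

/*
 * Illustrative userspace sketch (not kernel code) of the writeback branch
 * added above. All names are hypothetical stand-ins for the page flags and
 * scan-control state tested in the hunk.
 */
#include <stdbool.h>
#include <stdio.h>

enum wb_action {
	WB_MARK_RECLAIM_AND_KEEP,	/* SetPageReclaim + nr_writeback++ + goto keep_locked */
	WB_WAIT_FOR_WRITEBACK		/* wait_on_page_writeback() */
};

/* Mirrors the new test: only memcg reclaim with __GFP_IO on a page that
 * already has PageReclaim set is allowed to wait; everything else is
 * marked and skipped. */
static enum wb_action writeback_action(bool global_reclaim, bool page_reclaim,
				       bool gfp_io)
{
	if (global_reclaim || !page_reclaim || !gfp_io)
		return WB_MARK_RECLAIM_AND_KEEP;
	return WB_WAIT_FOR_WRITEBACK;
}

int main(void)
{
	/* memcg reclaim, page already tagged PageReclaim, __GFP_IO allowed: waits */
	printf("%d\n", writeback_action(false, true, true));
	/* global reclaim never waits here; it marks the page and moves on */
	printf("%d\n", writeback_action(true, true, true));
	return 0;
}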
@@ -921,6 +954,7 @@ keep:
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
+	mem_cgroup_uncharge_end();
 	*ret_nr_dirty += nr_dirty;
 	*ret_nr_writeback += nr_writeback;
 	return nr_reclaimed;
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
  * by looking at the fraction of the pages scanned we did rotate back
  * onto the active list instead of evict.
  *
- * nr[0] = anon pages to scan; nr[1] = file pages to scan
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			   unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+	bool wmark_ok;
+
+	for (i = 0; i <= ZONE_NORMAL; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+	/* kswapd must be awake if processes are being throttled */
+	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+		pgdat->classzone_idx = min(pgdat->classzone_idx,
+						(enum zone_type)ZONE_NORMAL);
+		wake_up_interruptible(&pgdat->kswapd_wait);
+	}
+
+	return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	pg_data_t *pgdat;
+
+	/*
+	 * Kernel threads should not be throttled as they may be indirectly
+	 * responsible for cleaning pages necessary for reclaim to make forward
+	 * progress. kjournald for example may enter direct reclaim while
+	 * committing a transaction where throttling it could forcing other
+	 * processes to block on log_wait_commit().
+	 */
+	if (current->flags & PF_KTHREAD)
+		return;
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	pgdat = zone->zone_pgdat;
+	if (pfmemalloc_watermark_ok(pgdat))
+		return;
+
+	/* Account for the throttling */
+	count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+	/*
+	 * If the caller cannot enter the filesystem, it's possible that it
+	 * is due to the caller holding an FS lock or performing a journal
+	 * transaction in the case of a filesystem like ext[3|4]. In this case,
+	 * it is not safe to block on pfmemalloc_wait as kswapd could be
+	 * blocked waiting on the same lock. Instead, throttle for up to a
+	 * second before continuing.
+	 */
+	if (!(gfp_mask & __GFP_FS)) {
+		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+			pfmemalloc_watermark_ok(pgdat), HZ);
+		return;
+	}
+
+	/* Throttle until kswapd wakes the process */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
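Note: the hunk above adds pfmemalloc_watermark_ok(), which compares the free pages of the node's lower zones against half the sum of their min watermarks, and throttle_direct_reclaim(), which uses it to decide whether a direct reclaimer should sleep on pfmemalloc_wait. A rough userspace sketch of just the watermark arithmetic, with simplified hypothetical zone structs in place of pg_data_t/struct zone, is:

/*
 * Illustrative userspace sketch of the watermark test in
 * pfmemalloc_watermark_ok(); the structs below are hypothetical
 * simplifications, not the kernel's data structures.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
	unsigned long min_wmark_pages;	/* stands in for min_wmark_pages(zone) */
	unsigned long nr_free_pages;	/* stands in for NR_FREE_PAGES */
};

/* Sum the min watermarks and free pages of the lower zones (up to and
 * including "normal") and allow progress while free > reserve / 2. */
static bool pfmemalloc_wmark_ok(const struct fake_zone *zones, int nr_lower_zones)
{
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;

	for (i = 0; i < nr_lower_zones; i++) {
		pfmemalloc_reserve += zones[i].min_wmark_pages;
		free_pages += zones[i].nr_free_pages;
	}
	return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
	struct fake_zone zones[] = {
		{ .min_wmark_pages = 128,  .nr_free_pages = 40  },	/* e.g. DMA */
		{ .min_wmark_pages = 1024, .nr_free_pages = 500 },	/* e.g. NORMAL */
	};

	/* free = 540, reserve/2 = 576: not ok, so direct reclaimers would throttle */
	printf("wmark_ok = %d\n", pfmemalloc_wmark_ok(zones, 2));
	return 0;
}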
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
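Note: the try_to_free_pages() hunk above makes every direct reclaimer pass through throttle_direct_reclaim() first, and then bail out with 1 if a fatal signal arrived while it was throttled, so the page allocator treats the attempt as progress instead of escalating to OOM. A hypothetical userspace sketch of that entry-point pattern:

/*
 * Illustrative userspace sketch of the new try_to_free_pages() entry
 * sequence (throttle first, then bail out on a fatal signal). The helper
 * names are hypothetical stand-ins for the kernel functions in the hunk.
 */
#include <stdbool.h>
#include <stdio.h>

static void maybe_throttle_direct_reclaim(void)
{
	/* Stand-in for throttle_direct_reclaim(): may sleep on pfmemalloc_wait. */
}

static bool fatal_signal_pending_sketch(void)
{
	/* Stand-in for fatal_signal_pending(current). */
	return false;
}

static unsigned long try_to_free_pages_sketch(void)
{
	maybe_throttle_direct_reclaim();

	/*
	 * Returning 1 (not 0) tells the caller that some progress was made,
	 * so the allocator does not consider triggering the OOM killer for a
	 * task that is already dying.
	 */
	if (fatal_signal_pending_sketch())
		return 1;

	/* ... the normal direct reclaim pass would run here ... */
	return 0;
}

int main(void)
{
	printf("reclaimed = %lu\n", try_to_free_pages_sketch());
	return 0;
}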
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	return nr_reclaimed;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 						gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
 	int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
-		return true;
+		return false;
+
+	/*
+	 * There is a potential race between when kswapd checks its watermarks
+	 * and a process gets throttled. There is also a potential race if
+	 * processes get throttled, kswapd wakes, a large process exits therby
+	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
+	 * so wake them now if necessary. If necessary, processes will wake
+	 * kswapd and get throttled again
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+		wake_up(&pgdat->pfmemalloc_wait);
+		return false;
+	}
 
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 	 * must be balanced
 	 */
 	if (order)
-		return !pgdat_balanced(pgdat, balanced, classzone_idx);
+		return pgdat_balanced(pgdat, balanced, classzone_idx);
 	else
-		return !all_zones_ok;
+		return all_zones_ok;
 }
 
 /*
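Note: taken together, the hunks above rename sleeping_prematurely() to prepare_kswapd_sleep() and flip its return convention: it now answers "is kswapd ready to sleep?" rather than "is this sleep premature?", and it refuses to sleep while direct reclaimers are still parked on pfmemalloc_wait. A compressed userspace sketch of that decision, with hypothetical flags standing in for the real pgdat state, could read:

/*
 * Illustrative userspace sketch of the prepare_kswapd_sleep() logic, with
 * its return sense inverted relative to the old sleeping_prematurely()
 * (true now means "safe to sleep"). Inputs are hypothetical stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

static bool prepare_kswapd_sleep_sketch(long remaining,
					bool throttled_waiters,
					bool node_balanced)
{
	/* A direct reclaimer woke kswapd within HZ/10: not safe to sleep. */
	if (remaining)
		return false;

	/* Processes throttled on pfmemalloc_wait must be woken first; the
	 * real code calls wake_up(&pgdat->pfmemalloc_wait) at this point. */
	if (throttled_waiters)
		return false;

	/* Otherwise sleep only if the watermark/balance checks pass. */
	return node_balanced;
}

int main(void)
{
	printf("%d\n", prepare_kswapd_sleep_sketch(0, false, true));	/* sleep */
	printf("%d\n", prepare_kswapd_sleep_sketch(5, false, true));	/* keep working */
	printf("%d\n", prepare_kswapd_sleep_sketch(0, true, true));	/* wake waiters first */
	return 0;
}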
@@ -2537,7 +2677,7 @@ loop_again:
 			 * consider it to be no longer congested. It's
 			 * possible there are dirty pages backed by
 			 * congested BDIs but as pressure is relieved,
-			 * spectulatively avoid congestion waits
+			 * speculatively avoid congestion waits
 			 */
 			zone_clear_flag(zone, ZONE_CONGESTED);
 			if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
 			}
 
 		}
+
+		/*
+		 * If the low watermark is met there is no need for processes
+		 * to be throttled on pfmemalloc_wait as they should not be
+		 * able to safely make forward progress. Wake them
+		 */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat))
+			wake_up(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
@@ -2646,7 +2796,7 @@ out:
 	}
 
 	/*
-	 * Return the order we were reclaiming at so sleeping_prematurely()
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
 	 * if another caller entered the allocator slow path while kswapd
 	 * was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 * them before going back to sleep.
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-		schedule();
+
+		if (!kthread_should_stop())
+			schedule();
+
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
 		if (remaining)