Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	259
1 file changed, 180 insertions(+), 79 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9cc..7ef69124fa3e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	int may_swap;
 
-	int swappiness;
-
 	int order;
 
 	/*
@@ -107,6 +105,7 @@ struct scan_control {
 
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
+	struct memcg_scanrecord *memcg_record;
 
 	/*
 	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -173,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
 				struct scan_control *sc, enum lru_list lru)
 {
 	if (!scanning_global_lru(sc))
-		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
+		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
+				zone_to_nid(zone), zone_idx(zone), BIT(lru));
 
 	return zone_page_state(zone, NR_LRU_BASE + lru);
 }
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
 		unsigned long long delta;
 		unsigned long total_scan;
 		unsigned long max_pass;
+		int shrink_ret = 0;
+		long nr;
+		long new_nr;
+		long batch_size = shrinker->batch ? shrinker->batch
+						  : SHRINK_BATCH;
+
+		/*
+		 * copy the current shrinker scan count into a local variable
+		 * and zero it so that other concurrent shrinker invocations
+		 * don't also do this scanning work.
+		 */
+		do {
+			nr = shrinker->nr;
+		} while (cmpxchg(&shrinker->nr, nr, 0) != nr);
 
+		total_scan = nr;
 		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
 		delta = (4 * nr_pages_scanned) / shrinker->seeks;
 		delta *= max_pass;
 		do_div(delta, lru_pages + 1);
-		shrinker->nr += delta;
-		if (shrinker->nr < 0) {
+		total_scan += delta;
+		if (total_scan < 0) {
 			printk(KERN_ERR "shrink_slab: %pF negative objects to "
 			       "delete nr=%ld\n",
-			       shrinker->shrink, shrinker->nr);
-			shrinker->nr = max_pass;
+			       shrinker->shrink, total_scan);
+			total_scan = max_pass;
 		}
 
 		/*
+		 * We need to avoid excessive windup on filesystem shrinkers
+		 * due to large numbers of GFP_NOFS allocations causing the
+		 * shrinkers to return -1 all the time. This results in a large
+		 * nr being built up so when a shrink that can do some work
+		 * comes along it empties the entire cache due to nr >>>
+		 * max_pass. This is bad for sustaining a working set in
+		 * memory.
+		 *
+		 * Hence only allow the shrinker to scan the entire cache when
+		 * a large delta change is calculated directly.
+		 */
+		if (delta < max_pass / 4)
+			total_scan = min(total_scan, max_pass / 2);
+
+		/*
 		 * Avoid risking looping forever due to too large nr value:
 		 * never try to free more than twice the estimate number of
 		 * freeable entries.
 		 */
-		if (shrinker->nr > max_pass * 2)
-			shrinker->nr = max_pass * 2;
+		if (total_scan > max_pass * 2)
+			total_scan = max_pass * 2;
 
-		total_scan = shrinker->nr;
-		shrinker->nr = 0;
+		trace_mm_shrink_slab_start(shrinker, shrink, nr,
+					nr_pages_scanned, lru_pages,
+					max_pass, delta, total_scan);
 
-		while (total_scan >= SHRINK_BATCH) {
-			long this_scan = SHRINK_BATCH;
-			int shrink_ret;
+		while (total_scan >= batch_size) {
 			int nr_before;
 
 			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
 			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							this_scan);
+							batch_size);
 			if (shrink_ret == -1)
 				break;
 			if (shrink_ret < nr_before)
 				ret += nr_before - shrink_ret;
-			count_vm_events(SLABS_SCANNED, this_scan);
-			total_scan -= this_scan;
+			count_vm_events(SLABS_SCANNED, batch_size);
+			total_scan -= batch_size;
 
 			cond_resched();
 		}
 
-		shrinker->nr += total_scan;
+		/*
+		 * move the unused scan count back into the shrinker in a
+		 * manner that handles concurrent updates. If we exhausted the
+		 * scan, there is no need to do an update.
+		 */
+		do {
+			nr = shrinker->nr;
+			new_nr = total_scan + nr;
+			if (total_scan <= 0)
+				break;
+		} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
+
+		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
 	}
 	up_read(&shrinker_rwsem);
 out:
@@ -1124,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 					nr_lumpy_dirty++;
 				scan++;
 			} else {
-				/* the page is freed already. */
-				if (!page_count(cursor_page))
+				/*
+				 * Check if the page is freed already.
+				 *
+				 * We can't use page_count() as that
+				 * requires compound_head and we don't
+				 * have a pin on the page here. If a
+				 * page is tail, we may or may not
+				 * have isolated the head, so assume
+				 * it's not free, it'd be tricky to
+				 * track the head status without a
+				 * page pin.
+				 */
+				if (!PageTail(cursor_page) &&
+				    !atomic_read(&cursor_page->_count))
 					continue;
 				break;
 			}
@@ -1296,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
 			int file = is_file_lru(lru);
 			int numpages = hpage_nr_pages(page);
 			reclaim_stat->recent_rotated[file] += numpages;
+			if (!scanning_global_lru(sc))
+				sc->memcg_record->nr_rotated[file] += numpages;
 		}
 		if (!pagevec_add(&pvec, page)) {
 			spin_unlock_irq(&zone->lru_lock);
@@ -1339,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 
 	reclaim_stat->recent_scanned[0] += *nr_anon;
 	reclaim_stat->recent_scanned[1] += *nr_file;
+	if (!scanning_global_lru(sc)) {
+		sc->memcg_record->nr_scanned[0] += *nr_anon;
+		sc->memcg_record->nr_scanned[1] += *nr_file;
+	}
 }
 
 /*
@@ -1452,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
 	}
 
+	if (!scanning_global_lru(sc))
+		sc->memcg_record->nr_freed[file] += nr_reclaimed;
+
 	local_irq_disable();
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1551,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 
 	reclaim_stat->recent_scanned[file] += nr_taken;
+	if (!scanning_global_lru(sc))
+		sc->memcg_record->nr_scanned[file] += nr_taken;
 
 	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	if (file)
@@ -1602,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * get_scan_ratio.
 	 */
 	reclaim_stat->recent_rotated[file] += nr_rotated;
+	if (!scanning_global_lru(sc))
+		sc->memcg_record->nr_rotated[file] += nr_rotated;
 
 	move_active_pages_to_lru(zone, &l_active,
 						LRU_ACTIVE + file * LRU_FILE);
@@ -1717,6 +1783,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
 }
 
+static int vmscan_swappiness(struct scan_control *sc)
+{
+	if (scanning_global_lru(sc))
+		return vm_swappiness;
+	return mem_cgroup_swappiness(sc->mem_cgroup);
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned. The relative value of each set of LRU lists is determined
@@ -1736,6 +1809,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	enum lru_list l;
 	int noswap = 0;
 	int force_scan = 0;
+	unsigned long nr_force_scan[2];
 
 
 	anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
@@ -1758,6 +1832,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 		fraction[0] = 0;
 		fraction[1] = 1;
 		denominator = 1;
+		nr_force_scan[0] = 0;
+		nr_force_scan[1] = SWAP_CLUSTER_MAX;
 		goto out;
 	}
 
@@ -1769,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 			fraction[0] = 1;
 			fraction[1] = 0;
 			denominator = 1;
+			nr_force_scan[0] = SWAP_CLUSTER_MAX;
+			nr_force_scan[1] = 0;
 			goto out;
 		}
 	}
@@ -1777,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	 * With swappiness at 100, anonymous and file have the same priority.
 	 * This scanning priority is essentially the inverse of IO cost.
 	 */
-	anon_prio = sc->swappiness;
-	file_prio = 200 - sc->swappiness;
+	anon_prio = vmscan_swappiness(sc);
+	file_prio = 200 - vmscan_swappiness(sc);
 
 	/*
 	 * OK, so we have swap space and a fair amount of page cache
@@ -1817,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	fraction[0] = ap;
 	fraction[1] = fp;
 	denominator = ap + fp + 1;
+	if (force_scan) {
+		unsigned long scan = SWAP_CLUSTER_MAX;
+		nr_force_scan[0] = div64_u64(scan * ap, denominator);
+		nr_force_scan[1] = div64_u64(scan * fp, denominator);
+	}
 out:
 	for_each_evictable_lru(l) {
 		int file = is_file_lru(l);
@@ -1837,12 +1920,8 @@ out:
 		 * memcg, priority drop can cause big latency. So, it's better
 		 * to scan small amount. See may_noscan above.
 		 */
-		if (!scan && force_scan) {
-			if (file)
-				scan = SWAP_CLUSTER_MAX;
-			else if (!noswap)
-				scan = SWAP_CLUSTER_MAX;
-		}
+		if (!scan && force_scan)
+			scan = nr_force_scan[file];
 		nr[l] = scan;
 	}
 }
@@ -1983,14 +2062,13 @@ restart:
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
-	unsigned long total_scanned = 0;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2005,19 +2083,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
 				continue;
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
+			/*
+			 * This steals pages from memory cgroups over softlimit
+			 * and returns the number of reclaimed pages and
+			 * scanned pages. This works for global memory pressure
+			 * and balancing, not for a memcg's limit.
+			 */
+			nr_soft_scanned = 0;
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+						sc->order, sc->gfp_mask,
+						&nr_soft_scanned);
+			sc->nr_reclaimed += nr_soft_reclaimed;
+			sc->nr_scanned += nr_soft_scanned;
+			/* need some check for avoid more shrink_zone() */
 		}
 
-		nr_soft_scanned = 0;
-		nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-							sc->order, sc->gfp_mask,
-							&nr_soft_scanned);
-		sc->nr_reclaimed += nr_soft_reclaimed;
-		total_scanned += nr_soft_scanned;
-
 		shrink_zone(priority, zone, sc);
 	}
-
-	return total_scanned;
 }
 
 static bool zone_reclaimable(struct zone *zone)
@@ -2081,8 +2163,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
 		if (!priority)
-			disable_swap_token();
-		total_scanned += shrink_zones(priority, zonelist, sc);
+			disable_swap_token(sc->mem_cgroup);
+		shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -2164,7 +2246,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
 		.may_swap = 1,
-		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 		.nodemask = nodemask,
@@ -2187,10 +2268,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
-						gfp_t gfp_mask, bool noswap,
-						unsigned int swappiness,
-						struct zone *zone,
-						unsigned long *nr_scanned)
+					gfp_t gfp_mask, bool noswap,
+					struct zone *zone,
+					struct memcg_scanrecord *rec,
+					unsigned long *scanned)
 {
 	struct scan_control sc = {
 		.nr_scanned = 0,
@@ -2198,10 +2279,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
-		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem,
+		.memcg_record = rec,
 	};
+	unsigned long start, end;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2210,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						      sc.may_writepage,
 						      sc.gfp_mask);
 
+	start = sched_clock();
 	/*
 	 * NOTE: Although we can get the priority field, using it
 	 * here is not a good idea, since it limits the pages we can scan.
@@ -2218,29 +2301,34 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 	 * the priority and make it zero.
 	 */
 	shrink_zone(0, zone, &sc);
+	end = sched_clock();
+
+	if (rec)
+		rec->elapsed += end - start;
+	*scanned = sc.nr_scanned;
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
-	*nr_scanned = sc.nr_scanned;
 	return sc.nr_reclaimed;
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
 					   bool noswap,
-					   unsigned int swappiness)
+					   struct memcg_scanrecord *rec)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	unsigned long start, end;
 	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
-		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem_cont,
+		.memcg_record = rec,
 		.nodemask = NULL, /* we don't care the placement */
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2249,6 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	start = sched_clock();
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
 	 * take care of from where we get pages. So the node where we start the
@@ -2263,6 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					    sc.gfp_mask);
 
 	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	end = sched_clock();
+	if (rec)
+		rec->elapsed += end - start;
 
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
 
@@ -2295,7 +2387,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	for (i = 0; i <= classzone_idx; i++)
 		present_pages += pgdat->node_zones[i].present_pages;
 
-	return balanced_pages > (present_pages >> 2);
+	/* A special case here: if zone has no page, we think it's balanced */
+	return balanced_pages >= (present_pages >> 2);
 }
 
 /* is kswapd sleeping prematurely? */
2301/* is kswapd sleeping prematurely? */ 2394/* is kswapd sleeping prematurely? */
@@ -2311,7 +2404,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 		return true;
 
 	/* Check the watermark levels */
-	for (i = 0; i < pgdat->nr_zones; i++) {
+	for (i = 0; i <= classzone_idx; i++) {
 		struct zone *zone = pgdat->node_zones + i;
 
 		if (!populated_zone(zone))
@@ -2329,7 +2422,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 	}
 
 	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
-							classzone_idx, 0))
+							i, 0))
 		all_zones_ok = false;
 	else
 		balanced += zone->present_pages;
@@ -2388,7 +2481,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		 * we want to put equal scanning pressure on each zone.
 		 */
 		.nr_to_reclaim = ULONG_MAX,
-		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 	};
@@ -2407,7 +2499,7 @@ loop_again:
 
 		/* The swap token gets in the way of swapout... */
 		if (!priority)
-			disable_swap_token();
+			disable_swap_token(NULL);
 
 		all_zones_ok = 1;
 		balanced = 0;
@@ -2436,7 +2528,6 @@ loop_again:
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
-				*classzone_idx = i;
 				break;
 			}
 		}
@@ -2495,18 +2586,18 @@ loop_again:
 				KSWAPD_ZONE_BALANCE_GAP_RATIO);
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone) + balance_gap,
-					end_zone, 0))
+					end_zone, 0)) {
 				shrink_zone(priority, zone, &sc);
-			reclaim_state->reclaimed_slab = 0;
-			nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-			total_scanned += sc.nr_scanned;
 
-			if (zone->all_unreclaimable)
-				continue;
-			if (nr_slab == 0 &&
-			    !zone_reclaimable(zone))
-				zone->all_unreclaimable = 1;
+				reclaim_state->reclaimed_slab = 0;
+				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
+				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+				total_scanned += sc.nr_scanned;
+
+				if (nr_slab == 0 && !zone_reclaimable(zone))
+					zone->all_unreclaimable = 1;
+			}
+
 			/*
 			 * If we've done a decent amount of scanning and
 			 * the reclaim ratio is low, start doing writepage
@@ -2516,6 +2607,12 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
+			if (zone->all_unreclaimable) {
+				if (end_zone && end_zone == i)
+					end_zone--;
+				continue;
+			}
+
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
@@ -2694,8 +2791,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  */
 static int kswapd(void *p)
 {
-	unsigned long order;
-	int classzone_idx;
+	unsigned long order, new_order;
+	int classzone_idx, new_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;
 
@@ -2725,17 +2822,23 @@ static int kswapd(void *p)
 	tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
 	set_freezable();
 
-	order = 0;
-	classzone_idx = MAX_NR_ZONES - 1;
+	order = new_order = 0;
+	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
 	for ( ; ; ) {
-		unsigned long new_order;
-		int new_classzone_idx;
 		int ret;
 
-		new_order = pgdat->kswapd_max_order;
-		new_classzone_idx = pgdat->classzone_idx;
-		pgdat->kswapd_max_order = 0;
-		pgdat->classzone_idx = MAX_NR_ZONES - 1;
+		/*
+		 * If the last balance_pgdat was unsuccessful it's unlikely a
+		 * new request of a similar or harder type will succeed soon
+		 * so consider going to sleep on the basis we reclaimed at
+		 */
+		if (classzone_idx >= new_classzone_idx && order == new_order) {
+			new_order = pgdat->kswapd_max_order;
+			new_classzone_idx = pgdat->classzone_idx;
+			pgdat->kswapd_max_order = 0;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
+		}
+
 		if (order < new_order || classzone_idx > new_classzone_idx) {
 			/*
 			 * Don't sleep if someone wants a larger 'order'
@@ -2748,7 +2851,7 @@ static int kswapd(void *p)
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
-			pgdat->classzone_idx = MAX_NR_ZONES - 1;
+			pgdat->classzone_idx = pgdat->nr_zones - 1;
 		}
 
 		ret = try_to_freeze();
@@ -2847,7 +2950,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.may_writepage = 1,
 		.nr_to_reclaim = nr_to_reclaim,
 		.hibernation_mode = 1,
-		.swappiness = vm_swappiness,
 		.order = 0,
 	};
 	struct shrink_control shrink = {
@@ -3034,7 +3136,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.nr_to_reclaim = max_t(unsigned long, nr_pages,
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
-		.swappiness = vm_swappiness,
 		.order = order,
 	};
 	struct shrink_control shrink = {