Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  259
1 file changed, 180 insertions, 79 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9cc..7ef69124fa3e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control { | |||
95 | /* Can pages be swapped as part of reclaim? */ | 95 | /* Can pages be swapped as part of reclaim? */ |
96 | int may_swap; | 96 | int may_swap; |
97 | 97 | ||
98 | int swappiness; | ||
99 | |||
100 | int order; | 98 | int order; |
101 | 99 | ||
102 | /* | 100 | /* |
@@ -107,6 +105,7 @@ struct scan_control { | |||
107 | 105 | ||
108 | /* Which cgroup do we reclaim from */ | 106 | /* Which cgroup do we reclaim from */ |
109 | struct mem_cgroup *mem_cgroup; | 107 | struct mem_cgroup *mem_cgroup; |
108 | struct memcg_scanrecord *memcg_record; | ||
110 | 109 | ||
111 | /* | 110 | /* |
112 | * Nodemask of nodes allowed by the caller. If NULL, all nodes | 111 | * Nodemask of nodes allowed by the caller. If NULL, all nodes |
@@ -173,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, | |||
173 | struct scan_control *sc, enum lru_list lru) | 172 | struct scan_control *sc, enum lru_list lru) |
174 | { | 173 | { |
175 | if (!scanning_global_lru(sc)) | 174 | if (!scanning_global_lru(sc)) |
176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); | 175 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, |
176 | zone_to_nid(zone), zone_idx(zone), BIT(lru)); | ||
177 | 177 | ||
178 | return zone_page_state(zone, NR_LRU_BASE + lru); | 178 | return zone_page_state(zone, NR_LRU_BASE + lru); |
179 | } | 179 | } |
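Note: the hunk above moves zone_nr_lru_pages() onto a (node id, zone index, LRU bitmask) interface, so one call to the memcg helper can sum any combination of LRU lists. A rough userspace illustration of the BIT(lru) mask idea, with invented names and sizes rather than the real mem_cgroup_zone_nr_lru_pages() internals:

#include <stdio.h>

/* Simplified LRU list ids; the kernel's enum lru_list has more entries. */
enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
                LRU_INACTIVE_FILE, LRU_ACTIVE_FILE, NR_LRU_LISTS };

#define BIT(n) (1UL << (n))

/* Invented per-zone LRU sizes, just to have something to sum. */
static unsigned long lru_size[NR_LRU_LISTS] = { 40, 10, 200, 50 };

/* Sum every LRU list whose bit is set in the mask. */
static unsigned long nr_lru_pages(unsigned long lru_mask)
{
    unsigned long total = 0;

    for (int l = 0; l < NR_LRU_LISTS; l++)
        if (lru_mask & BIT(l))
            total += lru_size[l];
    return total;
}

int main(void)
{
    /* a single list, as zone_nr_lru_pages() now asks for with BIT(lru) */
    printf("%lu\n", nr_lru_pages(BIT(LRU_ACTIVE_FILE)));                          /* 50 */
    /* or several lists in one call */
    printf("%lu\n", nr_lru_pages(BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))); /* 250 */
    return 0;
}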
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
250 | unsigned long long delta; | 250 | unsigned long long delta; |
251 | unsigned long total_scan; | 251 | unsigned long total_scan; |
252 | unsigned long max_pass; | 252 | unsigned long max_pass; |
253 | int shrink_ret = 0; | ||
254 | long nr; | ||
255 | long new_nr; | ||
256 | long batch_size = shrinker->batch ? shrinker->batch | ||
257 | : SHRINK_BATCH; | ||
258 | |||
259 | /* | ||
260 | * copy the current shrinker scan count into a local variable | ||
261 | * and zero it so that other concurrent shrinker invocations | ||
262 | * don't also do this scanning work. | ||
263 | */ | ||
264 | do { | ||
265 | nr = shrinker->nr; | ||
266 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); | ||
253 | 267 | ||
268 | total_scan = nr; | ||
254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); | 269 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 270 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
256 | delta *= max_pass; | 271 | delta *= max_pass; |
257 | do_div(delta, lru_pages + 1); | 272 | do_div(delta, lru_pages + 1); |
258 | shrinker->nr += delta; | 273 | total_scan += delta; |
259 | if (shrinker->nr < 0) { | 274 | if (total_scan < 0) { |
260 | printk(KERN_ERR "shrink_slab: %pF negative objects to " | 275 | printk(KERN_ERR "shrink_slab: %pF negative objects to " |
261 | "delete nr=%ld\n", | 276 | "delete nr=%ld\n", |
262 | shrinker->shrink, shrinker->nr); | 277 | shrinker->shrink, total_scan); |
263 | shrinker->nr = max_pass; | 278 | total_scan = max_pass; |
264 | } | 279 | } |
265 | 280 | ||
266 | /* | 281 | /* |
282 | * We need to avoid excessive windup on filesystem shrinkers | ||
283 | * due to large numbers of GFP_NOFS allocations causing the | ||
284 | * shrinkers to return -1 all the time. This results in a large | ||
285 | * nr being built up so when a shrink that can do some work | ||
286 | * comes along it empties the entire cache due to nr >>> | ||
287 | * max_pass. This is bad for sustaining a working set in | ||
288 | * memory. | ||
289 | * | ||
290 | * Hence only allow the shrinker to scan the entire cache when | ||
291 | * a large delta change is calculated directly. | ||
292 | */ | ||
293 | if (delta < max_pass / 4) | ||
294 | total_scan = min(total_scan, max_pass / 2); | ||
295 | |||
296 | /* | ||
267 | * Avoid risking looping forever due to too large nr value: | 297 | * Avoid risking looping forever due to too large nr value: |
268 | * never try to free more than twice the estimate number of | 298 | * never try to free more than twice the estimate number of |
269 | * freeable entries. | 299 | * freeable entries. |
270 | */ | 300 | */ |
271 | if (shrinker->nr > max_pass * 2) | 301 | if (total_scan > max_pass * 2) |
272 | shrinker->nr = max_pass * 2; | 302 | total_scan = max_pass * 2; |
273 | 303 | ||
274 | total_scan = shrinker->nr; | 304 | trace_mm_shrink_slab_start(shrinker, shrink, nr, |
275 | shrinker->nr = 0; | 305 | nr_pages_scanned, lru_pages, |
306 | max_pass, delta, total_scan); | ||
276 | 307 | ||
277 | while (total_scan >= SHRINK_BATCH) { | 308 | while (total_scan >= batch_size) { |
278 | long this_scan = SHRINK_BATCH; | ||
279 | int shrink_ret; | ||
280 | int nr_before; | 309 | int nr_before; |
281 | 310 | ||
282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); | 311 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
283 | shrink_ret = do_shrinker_shrink(shrinker, shrink, | 312 | shrink_ret = do_shrinker_shrink(shrinker, shrink, |
284 | this_scan); | 313 | batch_size); |
285 | if (shrink_ret == -1) | 314 | if (shrink_ret == -1) |
286 | break; | 315 | break; |
287 | if (shrink_ret < nr_before) | 316 | if (shrink_ret < nr_before) |
288 | ret += nr_before - shrink_ret; | 317 | ret += nr_before - shrink_ret; |
289 | count_vm_events(SLABS_SCANNED, this_scan); | 318 | count_vm_events(SLABS_SCANNED, batch_size); |
290 | total_scan -= this_scan; | 319 | total_scan -= batch_size; |
291 | 320 | ||
292 | cond_resched(); | 321 | cond_resched(); |
293 | } | 322 | } |
294 | 323 | ||
295 | shrinker->nr += total_scan; | 324 | /* |
325 | * move the unused scan count back into the shrinker in a | ||
326 | * manner that handles concurrent updates. If we exhausted the | ||
327 | * scan, there is no need to do an update. | ||
328 | */ | ||
329 | do { | ||
330 | nr = shrinker->nr; | ||
331 | new_nr = total_scan + nr; | ||
332 | if (total_scan <= 0) | ||
333 | break; | ||
334 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); | ||
335 | |||
336 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); | ||
296 | } | 337 | } |
297 | up_read(&shrinker_rwsem); | 338 | up_read(&shrinker_rwsem); |
298 | out: | 339 | out: |
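Note: the rewritten shrink_slab() above no longer updates shrinker->nr in place. It atomically claims the whole deferred count first (the cmpxchg loop), caps the windup, works in shrinker->batch sized chunks, and atomically hands back whatever was not consumed; the new tracepoints just record those values. A minimal userspace sketch of that claim/work/return pattern, using C11 atomics where the kernel uses cmpxchg() (names and numbers are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long deferred;        /* plays the role of shrinker->nr */

#define BATCH 128L                  /* shrinker->batch or SHRINK_BATCH */

/* Stand-in for do_shrinker_shrink(): pretend every object asked for is freed. */
static long scan_objects(long n) { return n; }

static void shrink(long new_work)
{
    /* Claim the entire deferred count so a concurrent caller does not
     * repeat the same work; the kernel does this with a cmpxchg() loop. */
    long total = atomic_exchange(&deferred, 0) + new_work;
    long freed = 0;

    while (total >= BATCH) {
        freed += scan_objects(BATCH);
        total -= BATCH;
    }

    /* Hand the unused remainder back for the next invocation. */
    if (total > 0)
        atomic_fetch_add(&deferred, total);

    printf("freed %ld, deferring %ld\n", freed, (long)atomic_load(&deferred));
}

int main(void)
{
    shrink(1000);   /* freed 896, deferring 104 */
    return 0;
}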
@@ -1124,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1124 | nr_lumpy_dirty++; | 1165 | nr_lumpy_dirty++; |
1125 | scan++; | 1166 | scan++; |
1126 | } else { | 1167 | } else { |
1127 | /* the page is freed already. */ | 1168 | /* |
1128 | if (!page_count(cursor_page)) | 1169 | * Check if the page is freed already. |
1170 | * | ||
1171 | * We can't use page_count() as that | ||
1172 | * requires compound_head and we don't | ||
1173 | * have a pin on the page here. If a | ||
1174 | * page is tail, we may or may not | ||
1175 | * have isolated the head, so assume | ||
1176 | * it's not free, it'd be tricky to | ||
1177 | * track the head status without a | ||
1178 | * page pin. | ||
1179 | */ | ||
1180 | if (!PageTail(cursor_page) && | ||
1181 | !atomic_read(&cursor_page->_count)) | ||
1129 | continue; | 1182 | continue; |
1130 | break; | 1183 | break; |
1131 | } | 1184 | } |
@@ -1296,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1296 | int file = is_file_lru(lru); | 1349 | int file = is_file_lru(lru); |
1297 | int numpages = hpage_nr_pages(page); | 1350 | int numpages = hpage_nr_pages(page); |
1298 | reclaim_stat->recent_rotated[file] += numpages; | 1351 | reclaim_stat->recent_rotated[file] += numpages; |
1352 | if (!scanning_global_lru(sc)) | ||
1353 | sc->memcg_record->nr_rotated[file] += numpages; | ||
1299 | } | 1354 | } |
1300 | if (!pagevec_add(&pvec, page)) { | 1355 | if (!pagevec_add(&pvec, page)) { |
1301 | spin_unlock_irq(&zone->lru_lock); | 1356 | spin_unlock_irq(&zone->lru_lock); |
@@ -1339,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, | |||
1339 | 1394 | ||
1340 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1395 | reclaim_stat->recent_scanned[0] += *nr_anon; |
1341 | reclaim_stat->recent_scanned[1] += *nr_file; | 1396 | reclaim_stat->recent_scanned[1] += *nr_file; |
1397 | if (!scanning_global_lru(sc)) { | ||
1398 | sc->memcg_record->nr_scanned[0] += *nr_anon; | ||
1399 | sc->memcg_record->nr_scanned[1] += *nr_file; | ||
1400 | } | ||
1342 | } | 1401 | } |
1343 | 1402 | ||
1344 | /* | 1403 | /* |
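Note: several hunks here thread a struct memcg_scanrecord pointer through scan_control so per-memcg reclaim statistics are accumulated next to the existing reclaim_stat counters, but only when !scanning_global_lru(sc). Judging purely from the fields this diff touches, the record looks roughly like the sketch below; the real definition belongs to the memcg side of the series and is not quoted here.

/* Approximate shape implied by the sc->memcg_record accesses in this
 * diff; field names follow the usage above, everything else is a guess. */
struct memcg_scanrecord_sketch {
    unsigned long nr_scanned[2];    /* [0] anon, [1] file pages scanned */
    unsigned long nr_rotated[2];    /* pages put back on the active list */
    unsigned long nr_freed[2];      /* pages actually reclaimed */
    unsigned long elapsed;          /* sched_clock() time spent reclaiming */
};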
@@ -1452,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1452 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1511 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); |
1453 | } | 1512 | } |
1454 | 1513 | ||
1514 | if (!scanning_global_lru(sc)) | ||
1515 | sc->memcg_record->nr_freed[file] += nr_reclaimed; | ||
1516 | |||
1455 | local_irq_disable(); | 1517 | local_irq_disable(); |
1456 | if (current_is_kswapd()) | 1518 | if (current_is_kswapd()) |
1457 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1519 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
@@ -1551,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1551 | } | 1613 | } |
1552 | 1614 | ||
1553 | reclaim_stat->recent_scanned[file] += nr_taken; | 1615 | reclaim_stat->recent_scanned[file] += nr_taken; |
1616 | if (!scanning_global_lru(sc)) | ||
1617 | sc->memcg_record->nr_scanned[file] += nr_taken; | ||
1554 | 1618 | ||
1555 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | 1619 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
1556 | if (file) | 1620 | if (file) |
@@ -1602,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1602 | * get_scan_ratio. | 1666 | * get_scan_ratio. |
1603 | */ | 1667 | */ |
1604 | reclaim_stat->recent_rotated[file] += nr_rotated; | 1668 | reclaim_stat->recent_rotated[file] += nr_rotated; |
1669 | if (!scanning_global_lru(sc)) | ||
1670 | sc->memcg_record->nr_rotated[file] += nr_rotated; | ||
1605 | 1671 | ||
1606 | move_active_pages_to_lru(zone, &l_active, | 1672 | move_active_pages_to_lru(zone, &l_active, |
1607 | LRU_ACTIVE + file * LRU_FILE); | 1673 | LRU_ACTIVE + file * LRU_FILE); |
@@ -1717,6 +1783,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1717 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1783 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1718 | } | 1784 | } |
1719 | 1785 | ||
1786 | static int vmscan_swappiness(struct scan_control *sc) | ||
1787 | { | ||
1788 | if (scanning_global_lru(sc)) | ||
1789 | return vm_swappiness; | ||
1790 | return mem_cgroup_swappiness(sc->mem_cgroup); | ||
1791 | } | ||
1792 | |||
1720 | /* | 1793 | /* |
1721 | * Determine how aggressively the anon and file LRU lists should be | 1794 | * Determine how aggressively the anon and file LRU lists should be |
1722 | * scanned. The relative value of each set of LRU lists is determined | 1795 | * scanned. The relative value of each set of LRU lists is determined |
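Note: vmscan_swappiness() above replaces the cached sc->swappiness field, so memcg reclaim always reads the cgroup's current setting via mem_cgroup_swappiness() instead of a value frozen when scan_control was built. Further down, that value splits a fixed budget of 200 between anon and file scan priority; a trivial worked example (60 is only the usual vm_swappiness default, used here as sample input):

#include <stdio.h>

int main(void)
{
    int swappiness = 60;               /* e.g. the global vm_swappiness default */
    int anon_prio  = swappiness;       /* anon_prio = vmscan_swappiness(sc) */
    int file_prio  = 200 - swappiness; /* file_prio = 200 - vmscan_swappiness(sc) */

    /* anon_prio=60 file_prio=140: page cache is preferred for reclaim */
    printf("anon_prio=%d file_prio=%d\n", anon_prio, file_prio);
    return 0;
}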
@@ -1736,6 +1809,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1736 | enum lru_list l; | 1809 | enum lru_list l; |
1737 | int noswap = 0; | 1810 | int noswap = 0; |
1738 | int force_scan = 0; | 1811 | int force_scan = 0; |
1812 | unsigned long nr_force_scan[2]; | ||
1739 | 1813 | ||
1740 | 1814 | ||
1741 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1815 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + |
@@ -1758,6 +1832,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1758 | fraction[0] = 0; | 1832 | fraction[0] = 0; |
1759 | fraction[1] = 1; | 1833 | fraction[1] = 1; |
1760 | denominator = 1; | 1834 | denominator = 1; |
1835 | nr_force_scan[0] = 0; | ||
1836 | nr_force_scan[1] = SWAP_CLUSTER_MAX; | ||
1761 | goto out; | 1837 | goto out; |
1762 | } | 1838 | } |
1763 | 1839 | ||
@@ -1769,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1769 | fraction[0] = 1; | 1845 | fraction[0] = 1; |
1770 | fraction[1] = 0; | 1846 | fraction[1] = 0; |
1771 | denominator = 1; | 1847 | denominator = 1; |
1848 | nr_force_scan[0] = SWAP_CLUSTER_MAX; | ||
1849 | nr_force_scan[1] = 0; | ||
1772 | goto out; | 1850 | goto out; |
1773 | } | 1851 | } |
1774 | } | 1852 | } |
@@ -1777,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1777 | * With swappiness at 100, anonymous and file have the same priority. | 1855 | * With swappiness at 100, anonymous and file have the same priority. |
1778 | * This scanning priority is essentially the inverse of IO cost. | 1856 | * This scanning priority is essentially the inverse of IO cost. |
1779 | */ | 1857 | */ |
1780 | anon_prio = sc->swappiness; | 1858 | anon_prio = vmscan_swappiness(sc); |
1781 | file_prio = 200 - sc->swappiness; | 1859 | file_prio = 200 - vmscan_swappiness(sc); |
1782 | 1860 | ||
1783 | /* | 1861 | /* |
1784 | * OK, so we have swap space and a fair amount of page cache | 1862 | * OK, so we have swap space and a fair amount of page cache |
@@ -1817,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1817 | fraction[0] = ap; | 1895 | fraction[0] = ap; |
1818 | fraction[1] = fp; | 1896 | fraction[1] = fp; |
1819 | denominator = ap + fp + 1; | 1897 | denominator = ap + fp + 1; |
1898 | if (force_scan) { | ||
1899 | unsigned long scan = SWAP_CLUSTER_MAX; | ||
1900 | nr_force_scan[0] = div64_u64(scan * ap, denominator); | ||
1901 | nr_force_scan[1] = div64_u64(scan * fp, denominator); | ||
1902 | } | ||
1820 | out: | 1903 | out: |
1821 | for_each_evictable_lru(l) { | 1904 | for_each_evictable_lru(l) { |
1822 | int file = is_file_lru(l); | 1905 | int file = is_file_lru(l); |
@@ -1837,12 +1920,8 @@ out: | |||
1837 | * memcg, priority drop can cause big latency. So, it's better | 1920 | * memcg, priority drop can cause big latency. So, it's better |
1838 | * to scan small amount. See may_noscan above. | 1921 | * to scan small amount. See may_noscan above. |
1839 | */ | 1922 | */ |
1840 | if (!scan && force_scan) { | 1923 | if (!scan && force_scan) |
1841 | if (file) | 1924 | scan = nr_force_scan[file]; |
1842 | scan = SWAP_CLUSTER_MAX; | ||
1843 | else if (!noswap) | ||
1844 | scan = SWAP_CLUSTER_MAX; | ||
1845 | } | ||
1846 | nr[l] = scan; | 1925 | nr[l] = scan; |
1847 | } | 1926 | } |
1848 | } | 1927 | } |
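Note: the nr_force_scan[] logic above makes the forced memcg scan proportional. When force_scan is set and the rounded per-list scan count comes out as zero, SWAP_CLUSTER_MAX pages are split between anon and file in the same ap:fp ratio as the regular fractions, instead of the old flat SWAP_CLUSTER_MAX per list. A small worked example with invented ap/fp values:

#include <inttypes.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32ULL

int main(void)
{
    uint64_t ap = 100, fp = 300;               /* invented weighted priorities */
    uint64_t denominator = ap + fp + 1;

    /* mirrors the div64_u64() calls above */
    uint64_t nr_force_scan_anon = SWAP_CLUSTER_MAX * ap / denominator;
    uint64_t nr_force_scan_file = SWAP_CLUSTER_MAX * fp / denominator;

    /* 7 anon + 23 file pages, preserving roughly the 1:3 ap:fp ratio */
    printf("anon=%" PRIu64 " file=%" PRIu64 "\n",
           nr_force_scan_anon, nr_force_scan_file);
    return 0;
}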
@@ -1983,14 +2062,13 @@ restart: | |||
1983 | * If a zone is deemed to be full of pinned pages then just give it a light | 2062 | * If a zone is deemed to be full of pinned pages then just give it a light |
1984 | * scan then give up on it. | 2063 | * scan then give up on it. |
1985 | */ | 2064 | */ |
1986 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | 2065 | static void shrink_zones(int priority, struct zonelist *zonelist, |
1987 | struct scan_control *sc) | 2066 | struct scan_control *sc) |
1988 | { | 2067 | { |
1989 | struct zoneref *z; | 2068 | struct zoneref *z; |
1990 | struct zone *zone; | 2069 | struct zone *zone; |
1991 | unsigned long nr_soft_reclaimed; | 2070 | unsigned long nr_soft_reclaimed; |
1992 | unsigned long nr_soft_scanned; | 2071 | unsigned long nr_soft_scanned; |
1993 | unsigned long total_scanned = 0; | ||
1994 | 2072 | ||
1995 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2073 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1996 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2074 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2005,19 +2083,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
2005 | continue; | 2083 | continue; |
2006 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2084 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2007 | continue; /* Let kswapd poll it */ | 2085 | continue; /* Let kswapd poll it */ |
2086 | /* | ||
2087 | * This steals pages from memory cgroups over softlimit | ||
2088 | * and returns the number of reclaimed pages and | ||
2089 | * scanned pages. This works for global memory pressure | ||
2090 | * and balancing, not for a memcg's limit. | ||
2091 | */ | ||
2092 | nr_soft_scanned = 0; | ||
2093 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2094 | sc->order, sc->gfp_mask, | ||
2095 | &nr_soft_scanned); | ||
2096 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2097 | sc->nr_scanned += nr_soft_scanned; | ||
2098 | /* need some check for avoid more shrink_zone() */ | ||
2008 | } | 2099 | } |
2009 | 2100 | ||
2010 | nr_soft_scanned = 0; | ||
2011 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2012 | sc->order, sc->gfp_mask, | ||
2013 | &nr_soft_scanned); | ||
2014 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2015 | total_scanned += nr_soft_scanned; | ||
2016 | |||
2017 | shrink_zone(priority, zone, sc); | 2101 | shrink_zone(priority, zone, sc); |
2018 | } | 2102 | } |
2019 | |||
2020 | return total_scanned; | ||
2021 | } | 2103 | } |
2022 | 2104 | ||
2023 | static bool zone_reclaimable(struct zone *zone) | 2105 | static bool zone_reclaimable(struct zone *zone) |
@@ -2081,8 +2163,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2081 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2163 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2082 | sc->nr_scanned = 0; | 2164 | sc->nr_scanned = 0; |
2083 | if (!priority) | 2165 | if (!priority) |
2084 | disable_swap_token(); | 2166 | disable_swap_token(sc->mem_cgroup); |
2085 | total_scanned += shrink_zones(priority, zonelist, sc); | 2167 | shrink_zones(priority, zonelist, sc); |
2086 | /* | 2168 | /* |
2087 | * Don't shrink slabs when reclaiming memory from | 2169 | * Don't shrink slabs when reclaiming memory from |
2088 | * over limit cgroups | 2170 | * over limit cgroups |
@@ -2164,7 +2246,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2164 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2246 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2165 | .may_unmap = 1, | 2247 | .may_unmap = 1, |
2166 | .may_swap = 1, | 2248 | .may_swap = 1, |
2167 | .swappiness = vm_swappiness, | ||
2168 | .order = order, | 2249 | .order = order, |
2169 | .mem_cgroup = NULL, | 2250 | .mem_cgroup = NULL, |
2170 | .nodemask = nodemask, | 2251 | .nodemask = nodemask, |
@@ -2187,10 +2268,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2187 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2268 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
2188 | 2269 | ||
2189 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2270 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2190 | gfp_t gfp_mask, bool noswap, | 2271 | gfp_t gfp_mask, bool noswap, |
2191 | unsigned int swappiness, | 2272 | struct zone *zone, |
2192 | struct zone *zone, | 2273 | struct memcg_scanrecord *rec, |
2193 | unsigned long *nr_scanned) | 2274 | unsigned long *scanned) |
2194 | { | 2275 | { |
2195 | struct scan_control sc = { | 2276 | struct scan_control sc = { |
2196 | .nr_scanned = 0, | 2277 | .nr_scanned = 0, |
@@ -2198,10 +2279,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2198 | .may_writepage = !laptop_mode, | 2279 | .may_writepage = !laptop_mode, |
2199 | .may_unmap = 1, | 2280 | .may_unmap = 1, |
2200 | .may_swap = !noswap, | 2281 | .may_swap = !noswap, |
2201 | .swappiness = swappiness, | ||
2202 | .order = 0, | 2282 | .order = 0, |
2203 | .mem_cgroup = mem, | 2283 | .mem_cgroup = mem, |
2284 | .memcg_record = rec, | ||
2204 | }; | 2285 | }; |
2286 | unsigned long start, end; | ||
2205 | 2287 | ||
2206 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2288 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2207 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2289 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
@@ -2210,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2210 | sc.may_writepage, | 2292 | sc.may_writepage, |
2211 | sc.gfp_mask); | 2293 | sc.gfp_mask); |
2212 | 2294 | ||
2295 | start = sched_clock(); | ||
2213 | /* | 2296 | /* |
2214 | * NOTE: Although we can get the priority field, using it | 2297 | * NOTE: Although we can get the priority field, using it |
2215 | * here is not a good idea, since it limits the pages we can scan. | 2298 | * here is not a good idea, since it limits the pages we can scan. |
@@ -2218,29 +2301,34 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2218 | * the priority and make it zero. | 2301 | * the priority and make it zero. |
2219 | */ | 2302 | */ |
2220 | shrink_zone(0, zone, &sc); | 2303 | shrink_zone(0, zone, &sc); |
2304 | end = sched_clock(); | ||
2305 | |||
2306 | if (rec) | ||
2307 | rec->elapsed += end - start; | ||
2308 | *scanned = sc.nr_scanned; | ||
2221 | 2309 | ||
2222 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2310 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2223 | 2311 | ||
2224 | *nr_scanned = sc.nr_scanned; | ||
2225 | return sc.nr_reclaimed; | 2312 | return sc.nr_reclaimed; |
2226 | } | 2313 | } |
2227 | 2314 | ||
2228 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2315 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
2229 | gfp_t gfp_mask, | 2316 | gfp_t gfp_mask, |
2230 | bool noswap, | 2317 | bool noswap, |
2231 | unsigned int swappiness) | 2318 | struct memcg_scanrecord *rec) |
2232 | { | 2319 | { |
2233 | struct zonelist *zonelist; | 2320 | struct zonelist *zonelist; |
2234 | unsigned long nr_reclaimed; | 2321 | unsigned long nr_reclaimed; |
2322 | unsigned long start, end; | ||
2235 | int nid; | 2323 | int nid; |
2236 | struct scan_control sc = { | 2324 | struct scan_control sc = { |
2237 | .may_writepage = !laptop_mode, | 2325 | .may_writepage = !laptop_mode, |
2238 | .may_unmap = 1, | 2326 | .may_unmap = 1, |
2239 | .may_swap = !noswap, | 2327 | .may_swap = !noswap, |
2240 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2328 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2241 | .swappiness = swappiness, | ||
2242 | .order = 0, | 2329 | .order = 0, |
2243 | .mem_cgroup = mem_cont, | 2330 | .mem_cgroup = mem_cont, |
2331 | .memcg_record = rec, | ||
2244 | .nodemask = NULL, /* we don't care the placement */ | 2332 | .nodemask = NULL, /* we don't care the placement */ |
2245 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2333 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2246 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2334 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
@@ -2249,6 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2249 | .gfp_mask = sc.gfp_mask, | 2337 | .gfp_mask = sc.gfp_mask, |
2250 | }; | 2338 | }; |
2251 | 2339 | ||
2340 | start = sched_clock(); | ||
2252 | /* | 2341 | /* |
2253 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't | 2342 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't |
2254 | * take care of from where we get pages. So the node where we start the | 2343 | * take care of from where we get pages. So the node where we start the |
@@ -2263,6 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2263 | sc.gfp_mask); | 2352 | sc.gfp_mask); |
2264 | 2353 | ||
2265 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); | 2354 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2355 | end = sched_clock(); | ||
2356 | if (rec) | ||
2357 | rec->elapsed += end - start; | ||
2266 | 2358 | ||
2267 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 2359 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
2268 | 2360 | ||
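Note: both memcg reclaim entry points above now bracket the actual reclaim work with sched_clock() and add the delta to rec->elapsed when a scan record was supplied. sched_clock() is kernel-internal, so the userspace sketch below uses clock_gettime(CLOCK_MONOTONIC) as a stand-in; names are illustrative.

#include <stdio.h>
#include <time.h>

struct scanrecord { unsigned long long elapsed; /* nanoseconds */ };

static unsigned long long now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void reclaim_pass(struct scanrecord *rec)
{
    unsigned long long start = now_ns();

    /* ... the real code runs shrink_zone()/do_try_to_free_pages() here ... */

    if (rec)
        rec->elapsed += now_ns() - start;
}

int main(void)
{
    struct scanrecord rec = { 0 };

    reclaim_pass(&rec);
    printf("reclaim took %llu ns\n", rec.elapsed);
    return 0;
}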
@@ -2295,7 +2387,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2295 | for (i = 0; i <= classzone_idx; i++) | 2387 | for (i = 0; i <= classzone_idx; i++) |
2296 | present_pages += pgdat->node_zones[i].present_pages; | 2388 | present_pages += pgdat->node_zones[i].present_pages; |
2297 | 2389 | ||
2298 | return balanced_pages > (present_pages >> 2); | 2390 | /* A special case here: if zone has no page, we think it's balanced */ |
2391 | return balanced_pages >= (present_pages >> 2); | ||
2299 | } | 2392 | } |
2300 | 2393 | ||
2301 | /* is kswapd sleeping prematurely? */ | 2394 | /* is kswapd sleeping prematurely? */ |
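Note: the pgdat_balanced() change above is a one-character fix with a real effect: with '>' a zone range containing no pages could never be reported balanced, since 0 > 0 is false, while '>=' treats the empty case as trivially balanced. In isolation:

#include <stdbool.h>
#include <stdio.h>

static bool pgdat_balanced_sketch(unsigned long balanced_pages,
                                  unsigned long present_pages)
{
    /* the patched test: at least a quarter of present pages balanced */
    return balanced_pages >= (present_pages >> 2);
}

int main(void)
{
    printf("%d\n", pgdat_balanced_sketch(0, 0));     /* 1: empty node counts as balanced */
    printf("%d\n", pgdat_balanced_sketch(25, 100));  /* 1: exactly a quarter is enough */
    printf("%d\n", pgdat_balanced_sketch(24, 100));  /* 0: below a quarter */
    return 0;
}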
@@ -2311,7 +2404,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2311 | return true; | 2404 | return true; |
2312 | 2405 | ||
2313 | /* Check the watermark levels */ | 2406 | /* Check the watermark levels */ |
2314 | for (i = 0; i < pgdat->nr_zones; i++) { | 2407 | for (i = 0; i <= classzone_idx; i++) { |
2315 | struct zone *zone = pgdat->node_zones + i; | 2408 | struct zone *zone = pgdat->node_zones + i; |
2316 | 2409 | ||
2317 | if (!populated_zone(zone)) | 2410 | if (!populated_zone(zone)) |
@@ -2329,7 +2422,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2329 | } | 2422 | } |
2330 | 2423 | ||
2331 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), | 2424 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2332 | classzone_idx, 0)) | 2425 | i, 0)) |
2333 | all_zones_ok = false; | 2426 | all_zones_ok = false; |
2334 | else | 2427 | else |
2335 | balanced += zone->present_pages; | 2428 | balanced += zone->present_pages; |
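Note: sleeping_prematurely() above now walks only the zones kswapd was actually asked to balance (0..classzone_idx) and checks each zone's watermark against its own index rather than the caller's classzone. A toy version of that loop, with an invented watermark predicate standing in for zone_watermark_ok_safe():

#include <stdbool.h>
#include <stdio.h>

/* Invented: pretend zone 1 is the only one below its high watermark. */
static bool zone_watermark_ok(int i) { return i != 1; }

static bool requested_zones_ok(int classzone_idx)
{
    for (int i = 0; i <= classzone_idx; i++)
        if (!zone_watermark_ok(i))
            return false;
    return true;
}

int main(void)
{
    printf("%d\n", requested_zones_ok(0));  /* 1: zone 1 is never consulted */
    printf("%d\n", requested_zones_ok(2));  /* 0: zone 1 is still below its mark */
    return 0;
}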
@@ -2388,7 +2481,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2388 | * we want to put equal scanning pressure on each zone. | 2481 | * we want to put equal scanning pressure on each zone. |
2389 | */ | 2482 | */ |
2390 | .nr_to_reclaim = ULONG_MAX, | 2483 | .nr_to_reclaim = ULONG_MAX, |
2391 | .swappiness = vm_swappiness, | ||
2392 | .order = order, | 2484 | .order = order, |
2393 | .mem_cgroup = NULL, | 2485 | .mem_cgroup = NULL, |
2394 | }; | 2486 | }; |
@@ -2407,7 +2499,7 @@ loop_again: | |||
2407 | 2499 | ||
2408 | /* The swap token gets in the way of swapout... */ | 2500 | /* The swap token gets in the way of swapout... */ |
2409 | if (!priority) | 2501 | if (!priority) |
2410 | disable_swap_token(); | 2502 | disable_swap_token(NULL); |
2411 | 2503 | ||
2412 | all_zones_ok = 1; | 2504 | all_zones_ok = 1; |
2413 | balanced = 0; | 2505 | balanced = 0; |
@@ -2436,7 +2528,6 @@ loop_again: | |||
2436 | if (!zone_watermark_ok_safe(zone, order, | 2528 | if (!zone_watermark_ok_safe(zone, order, |
2437 | high_wmark_pages(zone), 0, 0)) { | 2529 | high_wmark_pages(zone), 0, 0)) { |
2438 | end_zone = i; | 2530 | end_zone = i; |
2439 | *classzone_idx = i; | ||
2440 | break; | 2531 | break; |
2441 | } | 2532 | } |
2442 | } | 2533 | } |
@@ -2495,18 +2586,18 @@ loop_again: | |||
2495 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2586 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2496 | if (!zone_watermark_ok_safe(zone, order, | 2587 | if (!zone_watermark_ok_safe(zone, order, |
2497 | high_wmark_pages(zone) + balance_gap, | 2588 | high_wmark_pages(zone) + balance_gap, |
2498 | end_zone, 0)) | 2589 | end_zone, 0)) { |
2499 | shrink_zone(priority, zone, &sc); | 2590 | shrink_zone(priority, zone, &sc); |
2500 | reclaim_state->reclaimed_slab = 0; | ||
2501 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | ||
2502 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2503 | total_scanned += sc.nr_scanned; | ||
2504 | 2591 | ||
2505 | if (zone->all_unreclaimable) | 2592 | reclaim_state->reclaimed_slab = 0; |
2506 | continue; | 2593 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
2507 | if (nr_slab == 0 && | 2594 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2508 | !zone_reclaimable(zone)) | 2595 | total_scanned += sc.nr_scanned; |
2509 | zone->all_unreclaimable = 1; | 2596 | |
2597 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2598 | zone->all_unreclaimable = 1; | ||
2599 | } | ||
2600 | |||
2510 | /* | 2601 | /* |
2511 | * If we've done a decent amount of scanning and | 2602 | * If we've done a decent amount of scanning and |
2512 | * the reclaim ratio is low, start doing writepage | 2603 | * the reclaim ratio is low, start doing writepage |
@@ -2516,6 +2607,12 @@ loop_again: | |||
2516 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2607 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2517 | sc.may_writepage = 1; | 2608 | sc.may_writepage = 1; |
2518 | 2609 | ||
2610 | if (zone->all_unreclaimable) { | ||
2611 | if (end_zone && end_zone == i) | ||
2612 | end_zone--; | ||
2613 | continue; | ||
2614 | } | ||
2615 | |||
2519 | if (!zone_watermark_ok_safe(zone, order, | 2616 | if (!zone_watermark_ok_safe(zone, order, |
2520 | high_wmark_pages(zone), end_zone, 0)) { | 2617 | high_wmark_pages(zone), end_zone, 0)) { |
2521 | all_zones_ok = 0; | 2618 | all_zones_ok = 0; |
@@ -2694,8 +2791,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2694 | */ | 2791 | */ |
2695 | static int kswapd(void *p) | 2792 | static int kswapd(void *p) |
2696 | { | 2793 | { |
2697 | unsigned long order; | 2794 | unsigned long order, new_order; |
2698 | int classzone_idx; | 2795 | int classzone_idx, new_classzone_idx; |
2699 | pg_data_t *pgdat = (pg_data_t*)p; | 2796 | pg_data_t *pgdat = (pg_data_t*)p; |
2700 | struct task_struct *tsk = current; | 2797 | struct task_struct *tsk = current; |
2701 | 2798 | ||
@@ -2725,17 +2822,23 @@ static int kswapd(void *p) | |||
2725 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 2822 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
2726 | set_freezable(); | 2823 | set_freezable(); |
2727 | 2824 | ||
2728 | order = 0; | 2825 | order = new_order = 0; |
2729 | classzone_idx = MAX_NR_ZONES - 1; | 2826 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2730 | for ( ; ; ) { | 2827 | for ( ; ; ) { |
2731 | unsigned long new_order; | ||
2732 | int new_classzone_idx; | ||
2733 | int ret; | 2828 | int ret; |
2734 | 2829 | ||
2735 | new_order = pgdat->kswapd_max_order; | 2830 | /* |
2736 | new_classzone_idx = pgdat->classzone_idx; | 2831 | * If the last balance_pgdat was unsuccessful it's unlikely a |
2737 | pgdat->kswapd_max_order = 0; | 2832 | * new request of a similar or harder type will succeed soon |
2738 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | 2833 | * so consider going to sleep on the basis we reclaimed at |
2834 | */ | ||
2835 | if (classzone_idx >= new_classzone_idx && order == new_order) { | ||
2836 | new_order = pgdat->kswapd_max_order; | ||
2837 | new_classzone_idx = pgdat->classzone_idx; | ||
2838 | pgdat->kswapd_max_order = 0; | ||
2839 | pgdat->classzone_idx = pgdat->nr_zones - 1; | ||
2840 | } | ||
2841 | |||
2739 | if (order < new_order || classzone_idx > new_classzone_idx) { | 2842 | if (order < new_order || classzone_idx > new_classzone_idx) { |
2740 | /* | 2843 | /* |
2741 | * Don't sleep if someone wants a larger 'order' | 2844 | * Don't sleep if someone wants a larger 'order' |
@@ -2748,7 +2851,7 @@ static int kswapd(void *p) | |||
2748 | order = pgdat->kswapd_max_order; | 2851 | order = pgdat->kswapd_max_order; |
2749 | classzone_idx = pgdat->classzone_idx; | 2852 | classzone_idx = pgdat->classzone_idx; |
2750 | pgdat->kswapd_max_order = 0; | 2853 | pgdat->kswapd_max_order = 0; |
2751 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | 2854 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2752 | } | 2855 | } |
2753 | 2856 | ||
2754 | ret = try_to_freeze(); | 2857 | ret = try_to_freeze(); |
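Note: the reworked kswapd loop above only fetches a fresh (order, classzone_idx) request once the previous one has been dealt with, and when it does compare requests, a larger order or a lower classzone index counts as harder and is adopted. The comparison in isolation, with invented sample values:

#include <stdbool.h>
#include <stdio.h>

static bool request_is_harder(unsigned long order, int classzone_idx,
                              unsigned long new_order, int new_classzone_idx)
{
    /* mirrors: order < new_order || classzone_idx > new_classzone_idx */
    return order < new_order || classzone_idx > new_classzone_idx;
}

int main(void)
{
    /* an order-2 wakeup while balancing for order-0: harder, adopt it */
    printf("%d\n", request_is_harder(0, 3, 2, 3));  /* 1 */
    /* same order but a higher (easier) classzone index: keep the old one */
    printf("%d\n", request_is_harder(2, 1, 2, 3));  /* 0 */
    return 0;
}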
@@ -2847,7 +2950,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2847 | .may_writepage = 1, | 2950 | .may_writepage = 1, |
2848 | .nr_to_reclaim = nr_to_reclaim, | 2951 | .nr_to_reclaim = nr_to_reclaim, |
2849 | .hibernation_mode = 1, | 2952 | .hibernation_mode = 1, |
2850 | .swappiness = vm_swappiness, | ||
2851 | .order = 0, | 2953 | .order = 0, |
2852 | }; | 2954 | }; |
2853 | struct shrink_control shrink = { | 2955 | struct shrink_control shrink = { |
@@ -3034,7 +3136,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3034 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3136 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
3035 | SWAP_CLUSTER_MAX), | 3137 | SWAP_CLUSTER_MAX), |
3036 | .gfp_mask = gfp_mask, | 3138 | .gfp_mask = gfp_mask, |
3037 | .swappiness = vm_swappiness, | ||
3038 | .order = order, | 3139 | .order = order, |
3039 | }; | 3140 | }; |
3040 | struct shrink_control shrink = { | 3141 | struct shrink_control shrink = { |