Diffstat (limited to 'mm/vmscan.c')
 mm/vmscan.c | 435
 1 file changed, 332 insertions(+), 103 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d31d7ce52c0..17497d0cd8b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/compaction.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
@@ -51,11 +52,23 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/vmscan.h>
 
-enum lumpy_mode {
-        LUMPY_MODE_NONE,
-        LUMPY_MODE_ASYNC,
-        LUMPY_MODE_SYNC,
-};
+/*
+ * reclaim_mode determines how the inactive list is shrunk
+ * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
+ * RECLAIM_MODE_ASYNC:  Do not block
+ * RECLAIM_MODE_SYNC:   Allow blocking e.g. call wait_on_page_writeback
+ * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
+ *                      page from the LRU and reclaim all pages within a
+ *                      naturally aligned range
+ * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ *                      order-0 pages and then compact the zone
+ */
+typedef unsigned __bitwise__ reclaim_mode_t;
+#define RECLAIM_MODE_SINGLE             ((__force reclaim_mode_t)0x01u)
+#define RECLAIM_MODE_ASYNC              ((__force reclaim_mode_t)0x02u)
+#define RECLAIM_MODE_SYNC               ((__force reclaim_mode_t)0x04u)
+#define RECLAIM_MODE_LUMPYRECLAIM       ((__force reclaim_mode_t)0x08u)
+#define RECLAIM_MODE_COMPACTION         ((__force reclaim_mode_t)0x10u)
 
 struct scan_control {
         /* Incremented by the number of inactive pages that were scanned */
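
Because reclaim_mode is now a bitmask rather than a three-state enum, the rest of the patch combines flags with | and tests them with & instead of comparing for equality. A small standalone sketch of that usage (plain unsigned constants stand in for the kernel's __bitwise reclaim_mode_t; this is an illustration, not code from the patch):

#include <stdio.h>

/* Plain-integer stand-ins for the patch's reclaim_mode_t flags. */
#define RECLAIM_MODE_SINGLE             0x01u
#define RECLAIM_MODE_ASYNC              0x02u
#define RECLAIM_MODE_SYNC               0x04u
#define RECLAIM_MODE_LUMPYRECLAIM       0x08u
#define RECLAIM_MODE_COMPACTION         0x10u

int main(void)
{
        /* e.g. set_reclaim_mode() with compaction built in, sync requested */
        unsigned mode = RECLAIM_MODE_COMPACTION;
        mode |= RECLAIM_MODE_SYNC;

        /* Callers test individual bits, never compare for equality. */
        if (mode & RECLAIM_MODE_SYNC)
                printf("may wait on page writeback\n");
        if (!(mode & RECLAIM_MODE_LUMPYRECLAIM))
                printf("not lumpy: page references still respected\n");
        return 0;
}
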
@@ -88,7 +101,7 @@ struct scan_control {
          * Intend to reclaim enough continuous memory rather than reclaim
          * enough amount of memory. i.e, mode for high order allocation.
          */
-        enum lumpy_mode lumpy_reclaim_mode;
+        reclaim_mode_t reclaim_mode;
 
         /* Which cgroup do we reclaim from */
         struct mem_cgroup *mem_cgroup;
@@ -271,34 +284,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
         return ret;
 }
 
-static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
+static void set_reclaim_mode(int priority, struct scan_control *sc,
                                    bool sync)
 {
-        enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
+        reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
 
         /*
-         * Some reclaim have alredy been failed. No worth to try synchronous
-         * lumpy reclaim.
+         * Initially assume we are entering either lumpy reclaim or
+         * reclaim/compaction.Depending on the order, we will either set the
+         * sync mode or just reclaim order-0 pages later.
          */
-        if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
-                return;
+        if (COMPACTION_BUILD)
+                sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
+        else
+                sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
 
         /*
-         * If we need a large contiguous chunk of memory, or have
-         * trouble getting a small set of contiguous pages, we
-         * will reclaim both active and inactive pages.
+         * Avoid using lumpy reclaim or reclaim/compaction if possible by
+         * restricting when its set to either costly allocations or when
+         * under memory pressure
          */
         if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-                sc->lumpy_reclaim_mode = mode;
+                sc->reclaim_mode |= syncmode;
         else if (sc->order && priority < DEF_PRIORITY - 2)
-                sc->lumpy_reclaim_mode = mode;
+                sc->reclaim_mode |= syncmode;
         else
-                sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+                sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
 }
 
-static void disable_lumpy_reclaim_mode(struct scan_control *sc)
+static void reset_reclaim_mode(struct scan_control *sc)
 {
-        sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
+        sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
 }
 
 static inline int is_page_cache_freeable(struct page *page)
@@ -429,7 +445,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                  * first attempt to free a range of pages fails.
                  */
                 if (PageWriteback(page) &&
-                    sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
+                    (sc->reclaim_mode & RECLAIM_MODE_SYNC))
                         wait_on_page_writeback(page);
 
                 if (!PageWriteback(page)) {
@@ -437,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
                         ClearPageReclaim(page);
                 }
                 trace_mm_vmscan_writepage(page,
-                        trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
+                        trace_reclaim_flags(page, sc->reclaim_mode));
                 inc_zone_page_state(page, NR_VMSCAN_WRITE);
                 return PAGE_SUCCESS;
         }
@@ -494,9 +510,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
                 spin_unlock_irq(&mapping->tree_lock);
                 swapcache_free(swap, page);
         } else {
+                void (*freepage)(struct page *);
+
+                freepage = mapping->a_ops->freepage;
+
                 __remove_from_page_cache(page);
                 spin_unlock_irq(&mapping->tree_lock);
                 mem_cgroup_uncharge_cache_page(page);
+
+                if (freepage != NULL)
+                        freepage(page);
         }
 
         return 1;
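
The hunk above caches mapping->a_ops->freepage while the mapping is still known to be stable, and only calls it after the page has been dropped from the page cache and uncharged. A minimal userspace sketch of the same capture-the-callback-first pattern (the struct and function names below are invented for illustration and are not the kernel types):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for an ops table with an optional per-filesystem hook. */
struct ops {
        void (*freepage)(int page);
};

static void my_freepage(int page)
{
        printf("fs-specific cleanup for page %d\n", page);
}

int main(void)
{
        struct ops *aops = malloc(sizeof(*aops));
        void (*freepage)(int);

        if (!aops)
                return 1;
        aops->freepage = my_freepage;

        /* Capture the hook first: after the teardown below, the owning
         * structure may no longer be safe to dereference. */
        freepage = aops->freepage;
        free(aops);                     /* models the mapping going away */

        if (freepage != NULL)           /* the hook is optional, may be NULL */
                freepage(42);
        return 0;
}
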
@@ -615,7 +638,7 @@ static enum page_references page_check_references(struct page *page,
         referenced_page = TestClearPageReferenced(page);
 
         /* Lumpy reclaim - ignore references */
-        if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
+        if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
                 return PAGEREF_RECLAIM;
 
         /*
@@ -732,7 +755,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                          * for any page for which writeback has already
                          * started.
                          */
-                        if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
+                        if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
                             may_enter_fs)
                                 wait_on_page_writeback(page);
                         else {
@@ -888,7 +911,7 @@ cull_mlocked:
                 try_to_free_swap(page);
                 unlock_page(page);
                 putback_lru_page(page);
-                disable_lumpy_reclaim_mode(sc);
+                reset_reclaim_mode(sc);
                 continue;
 
 activate_locked:
@@ -901,7 +924,7 @@ activate_locked:
 keep_locked:
                 unlock_page(page);
 keep:
-                disable_lumpy_reclaim_mode(sc);
+                reset_reclaim_mode(sc);
 keep_lumpy:
                 list_add(&page->lru, &ret_pages);
                 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1021,7 +1044,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 case 0:
                         list_move(&page->lru, dst);
                         mem_cgroup_del_lru(page);
-                        nr_taken++;
+                        nr_taken += hpage_nr_pages(page);
                         break;
 
                 case -EBUSY:
@@ -1079,7 +1102,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                         if (__isolate_lru_page(cursor_page, mode, file) == 0) {
                                 list_move(&cursor_page->lru, dst);
                                 mem_cgroup_del_lru(cursor_page);
-                                nr_taken++;
+                                nr_taken += hpage_nr_pages(page);
                                 nr_lumpy_taken++;
                                 if (PageDirty(cursor_page))
                                         nr_lumpy_dirty++;
@@ -1134,14 +1157,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
         struct page *page;
 
         list_for_each_entry(page, page_list, lru) {
+                int numpages = hpage_nr_pages(page);
                 lru = page_lru_base_type(page);
                 if (PageActive(page)) {
                         lru += LRU_ACTIVE;
                         ClearPageActive(page);
-                        nr_active++;
+                        nr_active += numpages;
                 }
                 if (count)
-                        count[lru]++;
+                        count[lru] += numpages;
         }
 
         return nr_active;
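
The nr_taken/nr_active/count[] hunks switch the LRU bookkeeping from counting pages to counting base pages via hpage_nr_pages(), so a transparent hugepage on the LRU is accounted as many order-0 pages rather than one. A standalone model of that accounting, assuming the usual 512 base pages per 2MB huge page:

#include <stdio.h>

#define HPAGE_NR 512    /* assumed 2MB huge page / 4KB base page */

struct page { int is_huge; };

/* Models hpage_nr_pages(): 1 for a base page, HPAGE_NR for a THP head. */
static int hpage_nr_pages(const struct page *page)
{
        return page->is_huge ? HPAGE_NR : 1;
}

int main(void)
{
        struct page lru[3] = { {0}, {1}, {0} };
        unsigned long nr_taken = 0;

        for (int i = 0; i < 3; i++)
                nr_taken += hpage_nr_pages(&lru[i]);    /* not nr_taken++ */

        printf("nr_taken = %lu\n", nr_taken);           /* 514, not 3 */
        return 0;
}
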
@@ -1251,7 +1275,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
                 add_page_to_lru_list(zone, page, lru);
                 if (is_active_lru(lru)) {
                         int file = is_file_lru(lru);
-                        reclaim_stat->recent_rotated[file]++;
+                        int numpages = hpage_nr_pages(page);
+                        reclaim_stat->recent_rotated[file] += numpages;
                 }
                 if (!pagevec_add(&pvec, page)) {
                         spin_unlock_irq(&zone->lru_lock);
@@ -1317,7 +1342,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
                 return false;
 
         /* Only stall on lumpy reclaim */
-        if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
+        if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
                 return false;
 
         /* If we have relaimed everything on the isolated list, no stall */
@@ -1361,15 +1386,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                 return SWAP_CLUSTER_MAX;
         }
 
-        set_lumpy_reclaim_mode(priority, sc, false);
+        set_reclaim_mode(priority, sc, false);
         lru_add_drain();
         spin_lock_irq(&zone->lru_lock);
 
         if (scanning_global_lru(sc)) {
                 nr_taken = isolate_pages_global(nr_to_scan,
                         &page_list, &nr_scanned, sc->order,
-                        sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
-                                        ISOLATE_INACTIVE : ISOLATE_BOTH,
+                        sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+                                        ISOLATE_BOTH : ISOLATE_INACTIVE,
                         zone, 0, file);
                 zone->pages_scanned += nr_scanned;
                 if (current_is_kswapd())
@@ -1381,8 +1406,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
         } else {
                 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
                         &page_list, &nr_scanned, sc->order,
-                        sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
-                                        ISOLATE_INACTIVE : ISOLATE_BOTH,
+                        sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
+                                        ISOLATE_BOTH : ISOLATE_INACTIVE,
                         zone, sc->mem_cgroup,
                         0, file);
                 /*
@@ -1404,7 +1429,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 
         /* Check if we should syncronously wait for writeback */
         if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
-                set_lumpy_reclaim_mode(priority, sc, true);
+                set_reclaim_mode(priority, sc, true);
                 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
         }
 
@@ -1419,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
                 zone_idx(zone),
                 nr_scanned, nr_reclaimed,
                 priority,
-                trace_shrink_flags(file, sc->lumpy_reclaim_mode));
+                trace_shrink_flags(file, sc->reclaim_mode));
         return nr_reclaimed;
 }
 
@@ -1459,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone,
 
                 list_move(&page->lru, &zone->lru[lru].list);
                 mem_cgroup_add_lru_list(page, lru);
-                pgmoved++;
+                pgmoved += hpage_nr_pages(page);
 
                 if (!pagevec_add(&pvec, page) || list_empty(list)) {
                         spin_unlock_irq(&zone->lru_lock);
@@ -1527,7 +1552,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 }
 
                 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
-                        nr_rotated++;
+                        nr_rotated += hpage_nr_pages(page);
                         /*
                          * Identify referenced, file-backed active pages and
                          * give them one more trip around the active list. So
@@ -1798,6 +1823,57 @@ out:
 }
 
 /*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+                                        unsigned long nr_reclaimed,
+                                        unsigned long nr_scanned,
+                                        struct scan_control *sc)
+{
+        unsigned long pages_for_compaction;
+        unsigned long inactive_lru_pages;
+
+        /* If not in reclaim/compaction mode, stop */
+        if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
+                return false;
+
+        /*
+         * If we failed to reclaim and have scanned the full list, stop.
+         * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+         *       faster but obviously would be less likely to succeed
+         *       allocation. If this is desirable, use GFP_REPEAT to decide
+         *       if both reclaimed and scanned should be checked or just
+         *       reclaimed
+         */
+        if (!nr_reclaimed && !nr_scanned)
+                return false;
+
+        /*
+         * If we have not reclaimed enough pages for compaction and the
+         * inactive lists are large enough, continue reclaiming
+         */
+        pages_for_compaction = (2UL << sc->order);
+        inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+                                zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+        if (sc->nr_reclaimed < pages_for_compaction &&
+                        inactive_lru_pages > pages_for_compaction)
+                return true;
+
+        /* If compaction would go ahead or the allocation would succeed, stop */
+        switch (compaction_suitable(zone, sc->order)) {
+        case COMPACT_PARTIAL:
+        case COMPACT_CONTINUE:
+                return false;
+        default:
+                return true;
+        }
+}
+
+/*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
  */
 static void shrink_zone(int priority, struct zone *zone,
@@ -1806,9 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone,
         unsigned long nr[NR_LRU_LISTS];
         unsigned long nr_to_scan;
         enum lru_list l;
-        unsigned long nr_reclaimed = sc->nr_reclaimed;
+        unsigned long nr_reclaimed, nr_scanned;
         unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 
+restart:
+        nr_reclaimed = 0;
+        nr_scanned = sc->nr_scanned;
         get_scan_count(zone, sc, nr, priority);
 
         while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1834,8 +1913,7 @@ static void shrink_zone(int priority, struct zone *zone,
                 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                         break;
         }
-
-        sc->nr_reclaimed = nr_reclaimed;
+        sc->nr_reclaimed += nr_reclaimed;
 
         /*
          * Even if we did not try to evict anon pages at all, we want to
@@ -1844,6 +1922,11 @@ static void shrink_zone(int priority, struct zone *zone,
         if (inactive_anon_is_low(zone, sc))
                 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
+        /* reclaim/compaction might need reclaim to continue */
+        if (should_continue_reclaim(zone, nr_reclaimed,
+                                        sc->nr_scanned - nr_scanned, sc))
+                goto restart;
+
         throttle_vm_writeout(sc->gfp_mask);
 }
 
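
With the restart label in shrink_zone(), reclaim/compaction keeps looping until should_continue_reclaim() says that enough order-0 pages (roughly 2<<order) have been freed or that compaction could already proceed. The following standalone sketch models just that stopping rule; the compaction_suitable() call is reduced to a boolean and the per-pass reclaim amount is invented for illustration:

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of should_continue_reclaim(): keep going while fewer
 * than 2<<order pages have been reclaimed and the inactive lists could
 * still supply them; past that target, stop once compaction could run.
 */
static bool should_continue_reclaim(unsigned long nr_reclaimed, int order,
                                    unsigned long inactive_lru_pages,
                                    bool compaction_ready)
{
        unsigned long pages_for_compaction = 2UL << order;

        if (nr_reclaimed < pages_for_compaction &&
            inactive_lru_pages > pages_for_compaction)
                return true;

        return !compaction_ready;
}

int main(void)
{
        const int order = 9;            /* e.g. a THP-sized allocation */
        unsigned long reclaimed = 0;
        int rounds = 0;

        /* Each pass models one shrink_zone() round freeing 32 pages;
         * pretend compaction becomes viable once the target is reached. */
        while (should_continue_reclaim(reclaimed, order, 100000,
                                       reclaimed >= (2UL << order))) {
                reclaimed += 32;
                rounds++;
        }
        printf("%d passes, %lu pages reclaimed (target %lu)\n",
               rounds, reclaimed, 2UL << order);
        return 0;
}
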
@@ -2000,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                         struct zone *preferred_zone;
 
                         first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-                                                        NULL, &preferred_zone);
+                                                &cpuset_current_mems_allowed,
+                                                &preferred_zone);
                         wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
                 }
         }
@@ -2117,38 +2201,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+/*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+ * by the callers classzone_idx are added to balanced_pages. The total of
+ * balanced pages must be at least 25% of the zones allowed by classzone_idx
+ * for the node to be considered balanced. Forcing all zones to be balanced
+ * for high orders can cause excessive reclaim when there are imbalanced zones.
+ * The choice of 25% is due to
+ *   o a 16M DMA zone that is balanced will not balance a zone on any
+ *     reasonable sized machine
+ *   o On all other machines, the top zone must be at least a reasonable
+ *     precentage of the middle zones. For example, on 32-bit x86, highmem
+ *     would need to be at least 256M for it to be balance a whole node.
+ *     Similarly, on x86-64 the Normal zone would need to be at least 1G
+ *     to balance a node on its own. These seemed like reasonable ratios.
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
+                                                int classzone_idx)
+{
+        unsigned long present_pages = 0;
+        int i;
+
+        for (i = 0; i <= classzone_idx; i++)
+                present_pages += pgdat->node_zones[i].present_pages;
+
+        return balanced_pages > (present_pages >> 2);
+}
+
 /* is kswapd sleeping prematurely? */
-static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+                                        int classzone_idx)
 {
         int i;
+        unsigned long balanced = 0;
+        bool all_zones_ok = true;
 
         /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
         if (remaining)
-                return 1;
+                return true;
 
-        /* If after HZ/10, a zone is below the high mark, it's premature */
+        /* Check the watermark levels */
         for (i = 0; i < pgdat->nr_zones; i++) {
                 struct zone *zone = pgdat->node_zones + i;
 
                 if (!populated_zone(zone))
                         continue;
 
-                if (zone->all_unreclaimable)
+                /*
+                 * balance_pgdat() skips over all_unreclaimable after
+                 * DEF_PRIORITY. Effectively, it considers them balanced so
+                 * they must be considered balanced here as well if kswapd
+                 * is to sleep
+                 */
+                if (zone->all_unreclaimable) {
+                        balanced += zone->present_pages;
                         continue;
+                }
 
-                if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
-                                                                0, 0))
-                        return 1;
+                if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+                                                        classzone_idx, 0))
+                        all_zones_ok = false;
+                else
+                        balanced += zone->present_pages;
         }
 
-        return 0;
+        /*
+         * For high-order requests, the balanced zones must contain at least
+         * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
+         * must be balanced
+         */
+        if (order)
+                return pgdat_balanced(pgdat, balanced, classzone_idx);
+        else
+                return !all_zones_ok;
 }
 
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
- * Returns the number of pages which were actually freed.
+ * Returns the final order kswapd was reclaiming at
  *
  * There is special handling here for zones which are full of pinned pages.
  * This can happen if the pages are all mlocked, or if they are all used by
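
pgdat_balanced() declares a node balanced for a high-order wakeup once the zones that meet their high watermark (up to the caller's classzone_idx) hold more than 25% of the node's present pages. A standalone arithmetic sketch of that check, with made-up zone sizes:

#include <stdbool.h>
#include <stdio.h>

/* Toy node: present pages per zone, lowest to highest (made-up sizes). */
static const unsigned long present[] = { 4096, 221184, 786432 };

/* Model of pgdat_balanced(): zones meeting their high watermark must hold
 * more than 25% of the pages in zones 0..classzone_idx. */
static bool pgdat_balanced(unsigned long balanced_pages, int classzone_idx)
{
        unsigned long present_pages = 0;

        for (int i = 0; i <= classzone_idx; i++)
                present_pages += present[i];

        return balanced_pages > (present_pages >> 2);
}

int main(void)
{
        /* Case 1: only the two small lower zones meet their watermark. */
        unsigned long balanced = present[0] + present[1];
        printf("lower zones only: %s\n",
               pgdat_balanced(balanced, 2) ? "node balanced" : "keep reclaiming");

        /* Case 2: the large top zone is balanced as well. */
        balanced += present[2];
        printf("all zones:        %s\n",
               pgdat_balanced(balanced, 2) ? "node balanced" : "keep reclaiming");
        return 0;
}
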
@@ -2165,11 +2298,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
  * interoperates with the page allocator fallback scheme to ensure that aging
  * of pages is balanced across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
+                                                        int *classzone_idx)
 {
         int all_zones_ok;
+        unsigned long balanced;
         int priority;
         int i;
+        int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
         unsigned long total_scanned;
         struct reclaim_state *reclaim_state = current->reclaim_state;
         struct scan_control sc = {
@@ -2192,7 +2328,6 @@ loop_again:
         count_vm_event(PAGEOUTRUN);
 
         for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-                int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
                 unsigned long lru_pages = 0;
                 int has_under_min_watermark_zone = 0;
 
@@ -2201,6 +2336,7 @@ loop_again:
                         disable_swap_token();
 
                 all_zones_ok = 1;
+                balanced = 0;
 
                 /*
                  * Scan in the highmem->dma direction for the highest
@@ -2223,9 +2359,10 @@ loop_again:
                                 shrink_active_list(SWAP_CLUSTER_MAX, zone,
                                                         &sc, priority, 0);
 
-                        if (!zone_watermark_ok(zone, order,
+                        if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), 0, 0)) {
                                 end_zone = i;
+                                *classzone_idx = i;
                                 break;
                         }
                 }
@@ -2248,6 +2385,7 @@ loop_again:
                  * cause too much scanning of the lower zones.
                  */
                 for (i = 0; i <= end_zone; i++) {
+                        int compaction;
                         struct zone *zone = pgdat->node_zones + i;
                         int nr_slab;
 
@@ -2269,7 +2407,7 @@ loop_again:
                          * We put equal pressure on every zone, unless one
                          * zone has way too many pages free already.
                          */
-                        if (!zone_watermark_ok(zone, order,
+                        if (!zone_watermark_ok_safe(zone, order,
                                         8*high_wmark_pages(zone), end_zone, 0))
                                 shrink_zone(priority, zone, &sc);
                         reclaim_state->reclaimed_slab = 0;
@@ -2277,9 +2415,26 @@ loop_again:
                                                 lru_pages);
                         sc.nr_reclaimed += reclaim_state->reclaimed_slab;
                         total_scanned += sc.nr_scanned;
+
+                        compaction = 0;
+                        if (order &&
+                            zone_watermark_ok(zone, 0,
+                                               high_wmark_pages(zone),
+                                              end_zone, 0) &&
+                            !zone_watermark_ok(zone, order,
+                                               high_wmark_pages(zone),
+                                               end_zone, 0)) {
+                                compact_zone_order(zone,
+                                                   order,
+                                                   sc.gfp_mask, false,
+                                                   COMPACT_MODE_KSWAPD);
+                                compaction = 1;
+                        }
+
                         if (zone->all_unreclaimable)
                                 continue;
-                        if (nr_slab == 0 && !zone_reclaimable(zone))
+                        if (!compaction && nr_slab == 0 &&
+                            !zone_reclaimable(zone))
                                 zone->all_unreclaimable = 1;
                         /*
                          * If we've done a decent amount of scanning and
@@ -2290,7 +2445,7 @@ loop_again:
                             total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                 sc.may_writepage = 1;
 
-                        if (!zone_watermark_ok(zone, order,
+                        if (!zone_watermark_ok_safe(zone, order,
                                         high_wmark_pages(zone), end_zone, 0)) {
                                 all_zones_ok = 0;
                                 /*
@@ -2298,7 +2453,7 @@ loop_again:
                                  * means that we have a GFP_ATOMIC allocation
                                  * failure risk. Hurry up!
                                  */
-                                if (!zone_watermark_ok(zone, order,
+                                if (!zone_watermark_ok_safe(zone, order,
                                             min_wmark_pages(zone), end_zone, 0))
                                         has_under_min_watermark_zone = 1;
                         } else {
@@ -2310,10 +2465,12 @@ loop_again:
                                  * spectulatively avoid congestion waits
                                  */
                                 zone_clear_flag(zone, ZONE_CONGESTED);
+                                if (i <= *classzone_idx)
+                                        balanced += zone->present_pages;
                         }
 
                 }
-                if (all_zones_ok)
+                if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
                         break;          /* kswapd: all done */
                 /*
                  * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2336,7 +2493,13 @@ loop_again:
                         break;
         }
 out:
-        if (!all_zones_ok) {
+
+        /*
+         * order-0: All zones must meet high watermark for a balanced node
+         * high-order: Balanced zones must make up at least 25% of the node
+         *             for the node to be balanced
+         */
+        if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
                 cond_resched();
 
                 try_to_freeze();
@@ -2361,7 +2524,88 @@ out:
                 goto loop_again;
         }
 
-        return sc.nr_reclaimed;
+        /*
+         * If kswapd was reclaiming at a higher order, it has the option of
+         * sleeping without all zones being balanced. Before it does, it must
+         * ensure that the watermarks for order-0 on *all* zones are met and
+         * that the congestion flags are cleared. The congestion flag must
+         * be cleared as kswapd is the only mechanism that clears the flag
+         * and it is potentially going to sleep here.
+         */
+        if (order) {
+                for (i = 0; i <= end_zone; i++) {
+                        struct zone *zone = pgdat->node_zones + i;
+
+                        if (!populated_zone(zone))
+                                continue;
+
+                        if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                                continue;
+
+                        /* Confirm the zone is balanced for order-0 */
+                        if (!zone_watermark_ok(zone, 0,
+                                        high_wmark_pages(zone), 0, 0)) {
+                                order = sc.order = 0;
+                                goto loop_again;
+                        }
+
+                        /* If balanced, clear the congested flag */
+                        zone_clear_flag(zone, ZONE_CONGESTED);
+                }
+        }
+
+        /*
+         * Return the order we were reclaiming at so sleeping_prematurely()
+         * makes a decision on the order we were last reclaiming at. However,
+         * if another caller entered the allocator slow path while kswapd
+         * was awake, order will remain at the higher level
+         */
+        *classzone_idx = end_zone;
+        return order;
+}
+
+static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+{
+        long remaining = 0;
+        DEFINE_WAIT(wait);
+
+        if (freezing(current) || kthread_should_stop())
+                return;
+
+        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+
+        /* Try to sleep for a short interval */
+        if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+                remaining = schedule_timeout(HZ/10);
+                finish_wait(&pgdat->kswapd_wait, &wait);
+                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+        }
+
+        /*
+         * After a short sleep, check if it was a premature sleep. If not, then
+         * go fully to sleep until explicitly woken up.
+         */
+        if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
+
+                /*
+                 * vmstat counters are not perfectly accurate and the estimated
+                 * value for counters such as NR_FREE_PAGES can deviate from the
+                 * true value by nr_online_cpus * threshold. To avoid the zone
+                 * watermarks being breached while under pressure, we reduce the
+                 * per-cpu vmstat threshold while kswapd is awake and restore
+                 * them before going back to sleep.
+                 */
+                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
+                schedule();
+                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
+        } else {
+                if (remaining)
+                        count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+                else
+                        count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+        }
+        finish_wait(&pgdat->kswapd_wait, &wait);
 }
 
 /*
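
kswapd_try_to_sleep() sleeps in two stages: a trial HZ/10 nap, then a full sleep only if the node still looks balanced and nothing woke kswapd during the trial; otherwise one of the *_WMARK_HIT_QUICKLY counters is bumped. A standalone sketch of just that decision flow (booleans stand in for sleeping_prematurely() and the waitqueue; this models the control flow only, not real waiting):

#include <stdbool.h>
#include <stdio.h>

/*
 * Control-flow model of kswapd_try_to_sleep(). "premature" stands in for
 * sleeping_prematurely(pgdat, order, remaining, classzone_idx); a nonzero
 * "remaining" means something woke kswapd during the trial nap.
 */
static void kswapd_try_to_sleep(bool premature_at_first, bool woken_in_trial,
                                bool premature_after_trial)
{
        long remaining = 0;

        /* Stage 1: only take the short trial nap if things look balanced. */
        if (!premature_at_first)
                remaining = woken_in_trial ? 1 : 0;

        /* Stage 2: full sleep only if still balanced and not woken early. */
        if (!remaining && !premature_after_trial)
                printf("full sleep until explicitly woken\n");
        else if (remaining)
                printf("KSWAPD_LOW_WMARK_HIT_QUICKLY++\n");
        else
                printf("KSWAPD_HIGH_WMARK_HIT_QUICKLY++\n");
}

int main(void)
{
        kswapd_try_to_sleep(false, false, false);  /* node balanced */
        kswapd_try_to_sleep(false, true,  true);   /* woken by an allocator */
        kswapd_try_to_sleep(true,  false, true);   /* node still unbalanced */
        return 0;
}
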
@@ -2380,9 +2624,10 @@ out:
 static int kswapd(void *p)
 {
         unsigned long order;
+        int classzone_idx;
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;
-        DEFINE_WAIT(wait);
+
         struct reclaim_state reclaim_state = {
                 .reclaimed_slab = 0,
         };
@@ -2410,49 +2655,30 @@ static int kswapd(void *p)
         set_freezable();
 
         order = 0;
+        classzone_idx = MAX_NR_ZONES - 1;
         for ( ; ; ) {
                 unsigned long new_order;
+                int new_classzone_idx;
                 int ret;
 
-                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
                 new_order = pgdat->kswapd_max_order;
+                new_classzone_idx = pgdat->classzone_idx;
                 pgdat->kswapd_max_order = 0;
-                if (order < new_order) {
+                pgdat->classzone_idx = MAX_NR_ZONES - 1;
+                if (order < new_order || classzone_idx > new_classzone_idx) {
                         /*
                          * Don't sleep if someone wants a larger 'order'
-                         * allocation
+                         * allocation or has tigher zone constraints
                          */
                         order = new_order;
+                        classzone_idx = new_classzone_idx;
                 } else {
-                        if (!freezing(current) && !kthread_should_stop()) {
-                                long remaining = 0;
-
-                                /* Try to sleep for a short interval */
-                                if (!sleeping_prematurely(pgdat, order, remaining)) {
-                                        remaining = schedule_timeout(HZ/10);
-                                        finish_wait(&pgdat->kswapd_wait, &wait);
-                                        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
-                                }
-
-                                /*
-                                 * After a short sleep, check if it was a
-                                 * premature sleep. If not, then go fully
-                                 * to sleep until explicitly woken up
-                                 */
-                                if (!sleeping_prematurely(pgdat, order, remaining)) {
-                                        trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
-                                        schedule();
-                                } else {
-                                        if (remaining)
-                                                count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
-                                        else
-                                                count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
-                                }
-                        }
-
+                        kswapd_try_to_sleep(pgdat, order, classzone_idx);
                         order = pgdat->kswapd_max_order;
+                        classzone_idx = pgdat->classzone_idx;
+                        pgdat->kswapd_max_order = 0;
+                        pgdat->classzone_idx = MAX_NR_ZONES - 1;
                 }
-                finish_wait(&pgdat->kswapd_wait, &wait);
 
                 ret = try_to_freeze();
                 if (kthread_should_stop())
@@ -2464,7 +2690,7 @@ static int kswapd(void *p)
                  */
                 if (!ret) {
                         trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-                        balance_pgdat(pgdat, order);
+                        order = balance_pgdat(pgdat, order, &classzone_idx);
                 }
         }
         return 0;
@@ -2473,23 +2699,26 @@ static int kswapd(void *p)
 /*
  * A zone is low on free memory, so wake its kswapd task to service it.
  */
-void wakeup_kswapd(struct zone *zone, int order)
+void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 {
         pg_data_t *pgdat;
 
         if (!populated_zone(zone))
                 return;
 
-        pgdat = zone->zone_pgdat;
-        if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
-                return;
-        if (pgdat->kswapd_max_order < order)
-                pgdat->kswapd_max_order = order;
-        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
         if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                 return;
+        pgdat = zone->zone_pgdat;
+        if (pgdat->kswapd_max_order < order) {
+                pgdat->kswapd_max_order = order;
+                pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
+        }
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
+        if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
+                return;
+
+        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
         wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
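
wakeup_kswapd() now records the most restrictive classzone_idx seen while kswapd was being woken, so balance_pgdat() knows how low in the zone hierarchy the pending allocations are allowed to go. A standalone model of just that bookkeeping (the pgdat fields are mocked and the allocation examples in the comments are only illustrative):

#include <stdio.h>

#define MAX_NR_ZONES 4

/* Minimal model of the pgdat fields wakeup_kswapd() now updates. */
struct pgdat_model {
        int kswapd_max_order;
        int classzone_idx;
};

static void wakeup_kswapd(struct pgdat_model *pgdat, int order,
                          int classzone_idx)
{
        if (pgdat->kswapd_max_order < order) {
                pgdat->kswapd_max_order = order;
                /* Remember the most restrictive (lowest) classzone so far. */
                pgdat->classzone_idx = pgdat->classzone_idx < classzone_idx ?
                                        pgdat->classzone_idx : classzone_idx;
        }
}

int main(void)
{
        struct pgdat_model pgdat = { 0, MAX_NR_ZONES - 1 };

        wakeup_kswapd(&pgdat, 3, 2);    /* e.g. an order-3 highmem request */
        wakeup_kswapd(&pgdat, 9, 1);    /* a larger, more constrained one */

        printf("kswapd will balance order=%d up to classzone_idx=%d\n",
               pgdat.kswapd_max_order, pgdat.classzone_idx);
        return 0;
}
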