Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 435 |
1 file changed, 332 insertions, 103 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d31d7ce52c0..17497d0cd8b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
35 | #include <linux/compaction.h> | ||
35 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
36 | #include <linux/rwsem.h> | 37 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
@@ -51,11 +52,23 @@ | |||
51 | #define CREATE_TRACE_POINTS | 52 | #define CREATE_TRACE_POINTS |
52 | #include <trace/events/vmscan.h> | 53 | #include <trace/events/vmscan.h> |
53 | 54 | ||
54 | enum lumpy_mode { | 55 | /* |
55 | LUMPY_MODE_NONE, | 56 | * reclaim_mode determines how the inactive list is shrunk |
56 | LUMPY_MODE_ASYNC, | 57 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages |
57 | LUMPY_MODE_SYNC, | 58 | * RECLAIM_MODE_ASYNC: Do not block |
58 | }; | 59 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback |
60 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
61 | * page from the LRU and reclaim all pages within a | ||
62 | * naturally aligned range | ||
63 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
64 | * order-0 pages and then compact the zone | ||
65 | */ | ||
66 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
67 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
68 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
69 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
70 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
71 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
59 | 72 | ||
60 | struct scan_control { | 73 | struct scan_control { |
61 | /* Incremented by the number of inactive pages that were scanned */ | 74 | /* Incremented by the number of inactive pages that were scanned */ |
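Note on the hunk above: the old three-state lumpy_mode enum becomes a set of one-hot bit flags, so a single reclaim_mode value can describe both what is reclaimed (single page, lumpy range, reclaim/compaction) and how (async or sync). The following is a standalone userspace sketch of that flag pattern using the same RECLAIM_MODE_* names; the plain unsigned typedef (the kernel uses a sparse __bitwise__ type) and everything else here is illustrative, not part of the patch.

#include <stdio.h>

typedef unsigned int reclaim_mode_t;

#define RECLAIM_MODE_SINGLE        0x01u
#define RECLAIM_MODE_ASYNC         0x02u
#define RECLAIM_MODE_SYNC          0x04u
#define RECLAIM_MODE_LUMPYRECLAIM  0x08u
#define RECLAIM_MODE_COMPACTION    0x10u

int main(void)
{
	/* Combine "what is reclaimed" with "how": compaction plus sync */
	reclaim_mode_t mode = RECLAIM_MODE_COMPACTION | RECLAIM_MODE_SYNC;

	/* Tests become bit checks instead of equality comparisons */
	if (mode & RECLAIM_MODE_SYNC)
		printf("may block, e.g. waiting on page writeback\n");
	if (!(mode & RECLAIM_MODE_LUMPYRECLAIM))
		printf("not doing lumpy reclaim\n");
	return 0;
}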
@@ -88,7 +101,7 @@ struct scan_control { | |||
88 | * Intend to reclaim enough continuous memory rather than reclaim | 101 | * Intend to reclaim enough continuous memory rather than reclaim |
89 | * enough amount of memory. i.e, mode for high order allocation. | 102 | * enough amount of memory. i.e, mode for high order allocation. |
90 | */ | 103 | */ |
91 | enum lumpy_mode lumpy_reclaim_mode; | 104 | reclaim_mode_t reclaim_mode; |
92 | 105 | ||
93 | /* Which cgroup do we reclaim from */ | 106 | /* Which cgroup do we reclaim from */ |
94 | struct mem_cgroup *mem_cgroup; | 107 | struct mem_cgroup *mem_cgroup; |
@@ -271,34 +284,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
271 | return ret; | 284 | return ret; |
272 | } | 285 | } |
273 | 286 | ||
274 | static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, | 287 | static void set_reclaim_mode(int priority, struct scan_control *sc, |
275 | bool sync) | 288 | bool sync) |
276 | { | 289 | { |
277 | enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; | 290 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; |
278 | 291 | ||
279 | /* | 292 | /* |
280 | * Some reclaim have alredy been failed. No worth to try synchronous | 293 | * Initially assume we are entering either lumpy reclaim or |
281 | * lumpy reclaim. | 294 | * reclaim/compaction. Depending on the order, we will either set the |
295 | * sync mode or just reclaim order-0 pages later. | ||
282 | */ | 296 | */ |
283 | if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 297 | if (COMPACTION_BUILD) |
284 | return; | 298 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; |
299 | else | ||
300 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
285 | 301 | ||
286 | /* | 302 | /* |
287 | * If we need a large contiguous chunk of memory, or have | 303 | * Avoid using lumpy reclaim or reclaim/compaction if possible by |
288 | * trouble getting a small set of contiguous pages, we | 304 | * restricting when it is set to either costly allocations or when |
289 | * will reclaim both active and inactive pages. | 305 | * under memory pressure |
290 | */ | 306 | */ |
291 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 307 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
292 | sc->lumpy_reclaim_mode = mode; | 308 | sc->reclaim_mode |= syncmode; |
293 | else if (sc->order && priority < DEF_PRIORITY - 2) | 309 | else if (sc->order && priority < DEF_PRIORITY - 2) |
294 | sc->lumpy_reclaim_mode = mode; | 310 | sc->reclaim_mode |= syncmode; |
295 | else | 311 | else |
296 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 312 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
297 | } | 313 | } |
298 | 314 | ||
299 | static void disable_lumpy_reclaim_mode(struct scan_control *sc) | 315 | static void reset_reclaim_mode(struct scan_control *sc) |
300 | { | 316 | { |
301 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 317 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
302 | } | 318 | } |
303 | 319 | ||
304 | static inline int is_page_cache_freeable(struct page *page) | 320 | static inline int is_page_cache_freeable(struct page *page) |
@@ -429,7 +445,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
429 | * first attempt to free a range of pages fails. | 445 | * first attempt to free a range of pages fails. |
430 | */ | 446 | */ |
431 | if (PageWriteback(page) && | 447 | if (PageWriteback(page) && |
432 | sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) | 448 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) |
433 | wait_on_page_writeback(page); | 449 | wait_on_page_writeback(page); |
434 | 450 | ||
435 | if (!PageWriteback(page)) { | 451 | if (!PageWriteback(page)) { |
@@ -437,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
437 | ClearPageReclaim(page); | 453 | ClearPageReclaim(page); |
438 | } | 454 | } |
439 | trace_mm_vmscan_writepage(page, | 455 | trace_mm_vmscan_writepage(page, |
440 | trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); | 456 | trace_reclaim_flags(page, sc->reclaim_mode)); |
441 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 457 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
442 | return PAGE_SUCCESS; | 458 | return PAGE_SUCCESS; |
443 | } | 459 | } |
@@ -494,9 +510,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
494 | spin_unlock_irq(&mapping->tree_lock); | 510 | spin_unlock_irq(&mapping->tree_lock); |
495 | swapcache_free(swap, page); | 511 | swapcache_free(swap, page); |
496 | } else { | 512 | } else { |
513 | void (*freepage)(struct page *); | ||
514 | |||
515 | freepage = mapping->a_ops->freepage; | ||
516 | |||
497 | __remove_from_page_cache(page); | 517 | __remove_from_page_cache(page); |
498 | spin_unlock_irq(&mapping->tree_lock); | 518 | spin_unlock_irq(&mapping->tree_lock); |
499 | mem_cgroup_uncharge_cache_page(page); | 519 | mem_cgroup_uncharge_cache_page(page); |
520 | |||
521 | if (freepage != NULL) | ||
522 | freepage(page); | ||
500 | } | 523 | } |
501 | 524 | ||
502 | return 1; | 525 | return 1; |
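The hunk above snapshots mapping->a_ops->freepage while the mapping is still safe to dereference, removes the page from the page cache, and only afterwards invokes the hook if one was registered. Below is a minimal userspace sketch of that "snapshot an optional callback, call it after teardown" shape; the struct and function names are invented for illustration and are not kernel APIs.

#include <stdio.h>

struct ops {
	void (*freepage)(void *page);		/* optional hook, may be NULL */
};

struct mapping {
	const struct ops *a_ops;
};

static void my_freepage(void *page)
{
	printf("freepage hook called for %p\n", page);
}

static void remove_page(struct mapping *mapping, void *page)
{
	/* Snapshot the hook while the mapping is still safe to use */
	void (*freepage)(void *) = mapping->a_ops->freepage;

	/* ... here the page would be unhooked from the mapping ... */

	/* Invoke the hook afterwards, only if the owner registered one */
	if (freepage != NULL)
		freepage(page);
}

int main(void)
{
	const struct ops ops = { .freepage = my_freepage };
	struct mapping m = { .a_ops = &ops };
	int page;

	remove_page(&m, &page);
	return 0;
}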
@@ -615,7 +638,7 @@ static enum page_references page_check_references(struct page *page, | |||
615 | referenced_page = TestClearPageReferenced(page); | 638 | referenced_page = TestClearPageReferenced(page); |
616 | 639 | ||
617 | /* Lumpy reclaim - ignore references */ | 640 | /* Lumpy reclaim - ignore references */ |
618 | if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) | 641 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
619 | return PAGEREF_RECLAIM; | 642 | return PAGEREF_RECLAIM; |
620 | 643 | ||
621 | /* | 644 | /* |
@@ -732,7 +755,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
732 | * for any page for which writeback has already | 755 | * for any page for which writeback has already |
733 | * started. | 756 | * started. |
734 | */ | 757 | */ |
735 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && | 758 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
736 | may_enter_fs) | 759 | may_enter_fs) |
737 | wait_on_page_writeback(page); | 760 | wait_on_page_writeback(page); |
738 | else { | 761 | else { |
@@ -888,7 +911,7 @@ cull_mlocked: | |||
888 | try_to_free_swap(page); | 911 | try_to_free_swap(page); |
889 | unlock_page(page); | 912 | unlock_page(page); |
890 | putback_lru_page(page); | 913 | putback_lru_page(page); |
891 | disable_lumpy_reclaim_mode(sc); | 914 | reset_reclaim_mode(sc); |
892 | continue; | 915 | continue; |
893 | 916 | ||
894 | activate_locked: | 917 | activate_locked: |
@@ -901,7 +924,7 @@ activate_locked: | |||
901 | keep_locked: | 924 | keep_locked: |
902 | unlock_page(page); | 925 | unlock_page(page); |
903 | keep: | 926 | keep: |
904 | disable_lumpy_reclaim_mode(sc); | 927 | reset_reclaim_mode(sc); |
905 | keep_lumpy: | 928 | keep_lumpy: |
906 | list_add(&page->lru, &ret_pages); | 929 | list_add(&page->lru, &ret_pages); |
907 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 930 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
@@ -1021,7 +1044,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1021 | case 0: | 1044 | case 0: |
1022 | list_move(&page->lru, dst); | 1045 | list_move(&page->lru, dst); |
1023 | mem_cgroup_del_lru(page); | 1046 | mem_cgroup_del_lru(page); |
1024 | nr_taken++; | 1047 | nr_taken += hpage_nr_pages(page); |
1025 | break; | 1048 | break; |
1026 | 1049 | ||
1027 | case -EBUSY: | 1050 | case -EBUSY: |
@@ -1079,7 +1102,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1079 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1102 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1080 | list_move(&cursor_page->lru, dst); | 1103 | list_move(&cursor_page->lru, dst); |
1081 | mem_cgroup_del_lru(cursor_page); | 1104 | mem_cgroup_del_lru(cursor_page); |
1082 | nr_taken++; | 1105 | nr_taken += hpage_nr_pages(page); |
1083 | nr_lumpy_taken++; | 1106 | nr_lumpy_taken++; |
1084 | if (PageDirty(cursor_page)) | 1107 | if (PageDirty(cursor_page)) |
1085 | nr_lumpy_dirty++; | 1108 | nr_lumpy_dirty++; |
@@ -1134,14 +1157,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
1134 | struct page *page; | 1157 | struct page *page; |
1135 | 1158 | ||
1136 | list_for_each_entry(page, page_list, lru) { | 1159 | list_for_each_entry(page, page_list, lru) { |
1160 | int numpages = hpage_nr_pages(page); | ||
1137 | lru = page_lru_base_type(page); | 1161 | lru = page_lru_base_type(page); |
1138 | if (PageActive(page)) { | 1162 | if (PageActive(page)) { |
1139 | lru += LRU_ACTIVE; | 1163 | lru += LRU_ACTIVE; |
1140 | ClearPageActive(page); | 1164 | ClearPageActive(page); |
1141 | nr_active++; | 1165 | nr_active += numpages; |
1142 | } | 1166 | } |
1143 | if (count) | 1167 | if (count) |
1144 | count[lru]++; | 1168 | count[lru] += numpages; |
1145 | } | 1169 | } |
1146 | 1170 | ||
1147 | return nr_active; | 1171 | return nr_active; |
@@ -1251,7 +1275,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1251 | add_page_to_lru_list(zone, page, lru); | 1275 | add_page_to_lru_list(zone, page, lru); |
1252 | if (is_active_lru(lru)) { | 1276 | if (is_active_lru(lru)) { |
1253 | int file = is_file_lru(lru); | 1277 | int file = is_file_lru(lru); |
1254 | reclaim_stat->recent_rotated[file]++; | 1278 | int numpages = hpage_nr_pages(page); |
1279 | reclaim_stat->recent_rotated[file] += numpages; | ||
1255 | } | 1280 | } |
1256 | if (!pagevec_add(&pvec, page)) { | 1281 | if (!pagevec_add(&pvec, page)) { |
1257 | spin_unlock_irq(&zone->lru_lock); | 1282 | spin_unlock_irq(&zone->lru_lock); |
@@ -1317,7 +1342,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1317 | return false; | 1342 | return false; |
1318 | 1343 | ||
1319 | /* Only stall on lumpy reclaim */ | 1344 | /* Only stall on lumpy reclaim */ |
1320 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 1345 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1321 | return false; | 1346 | return false; |
1322 | 1347 | ||
1323 | /* If we have reclaimed everything on the isolated list, no stall */ | 1348 | /* If we have reclaimed everything on the isolated list, no stall */ |
@@ -1361,15 +1386,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1361 | return SWAP_CLUSTER_MAX; | 1386 | return SWAP_CLUSTER_MAX; |
1362 | } | 1387 | } |
1363 | 1388 | ||
1364 | set_lumpy_reclaim_mode(priority, sc, false); | 1389 | set_reclaim_mode(priority, sc, false); |
1365 | lru_add_drain(); | 1390 | lru_add_drain(); |
1366 | spin_lock_irq(&zone->lru_lock); | 1391 | spin_lock_irq(&zone->lru_lock); |
1367 | 1392 | ||
1368 | if (scanning_global_lru(sc)) { | 1393 | if (scanning_global_lru(sc)) { |
1369 | nr_taken = isolate_pages_global(nr_to_scan, | 1394 | nr_taken = isolate_pages_global(nr_to_scan, |
1370 | &page_list, &nr_scanned, sc->order, | 1395 | &page_list, &nr_scanned, sc->order, |
1371 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1396 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1372 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1397 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1373 | zone, 0, file); | 1398 | zone, 0, file); |
1374 | zone->pages_scanned += nr_scanned; | 1399 | zone->pages_scanned += nr_scanned; |
1375 | if (current_is_kswapd()) | 1400 | if (current_is_kswapd()) |
@@ -1381,8 +1406,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1381 | } else { | 1406 | } else { |
1382 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1407 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, |
1383 | &page_list, &nr_scanned, sc->order, | 1408 | &page_list, &nr_scanned, sc->order, |
1384 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1409 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1385 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1410 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1386 | zone, sc->mem_cgroup, | 1411 | zone, sc->mem_cgroup, |
1387 | 0, file); | 1412 | 0, file); |
1388 | /* | 1413 | /* |
@@ -1404,7 +1429,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1404 | 1429 | ||
1405 | /* Check if we should synchronously wait for writeback */ | 1430 | /* Check if we should synchronously wait for writeback */ |
1406 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1431 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1407 | set_lumpy_reclaim_mode(priority, sc, true); | 1432 | set_reclaim_mode(priority, sc, true); |
1408 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1433 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); |
1409 | } | 1434 | } |
1410 | 1435 | ||
@@ -1419,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1419 | zone_idx(zone), | 1444 | zone_idx(zone), |
1420 | nr_scanned, nr_reclaimed, | 1445 | nr_scanned, nr_reclaimed, |
1421 | priority, | 1446 | priority, |
1422 | trace_shrink_flags(file, sc->lumpy_reclaim_mode)); | 1447 | trace_shrink_flags(file, sc->reclaim_mode)); |
1423 | return nr_reclaimed; | 1448 | return nr_reclaimed; |
1424 | } | 1449 | } |
1425 | 1450 | ||
@@ -1459,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1459 | 1484 | ||
1460 | list_move(&page->lru, &zone->lru[lru].list); | 1485 | list_move(&page->lru, &zone->lru[lru].list); |
1461 | mem_cgroup_add_lru_list(page, lru); | 1486 | mem_cgroup_add_lru_list(page, lru); |
1462 | pgmoved++; | 1487 | pgmoved += hpage_nr_pages(page); |
1463 | 1488 | ||
1464 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | 1489 | if (!pagevec_add(&pvec, page) || list_empty(list)) { |
1465 | spin_unlock_irq(&zone->lru_lock); | 1490 | spin_unlock_irq(&zone->lru_lock); |
@@ -1527,7 +1552,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1527 | } | 1552 | } |
1528 | 1553 | ||
1529 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | 1554 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1530 | nr_rotated++; | 1555 | nr_rotated += hpage_nr_pages(page); |
1531 | /* | 1556 | /* |
1532 | * Identify referenced, file-backed active pages and | 1557 | * Identify referenced, file-backed active pages and |
1533 | * give them one more trip around the active list. So | 1558 | * give them one more trip around the active list. So |
@@ -1798,6 +1823,57 @@ out: | |||
1798 | } | 1823 | } |
1799 | 1824 | ||
1800 | /* | 1825 | /* |
1826 | * Reclaim/compaction depends on a number of pages being freed. To avoid | ||
1827 | * disruption to the system, a small number of order-0 pages continue to be | ||
1828 | * rotated and reclaimed in the normal fashion. However, by the time we get | ||
1829 | * back to the allocator and call try_to_compact_zone(), we ensure that | ||
1830 | * there are enough free pages for it to be likely successful | ||
1831 | */ | ||
1832 | static inline bool should_continue_reclaim(struct zone *zone, | ||
1833 | unsigned long nr_reclaimed, | ||
1834 | unsigned long nr_scanned, | ||
1835 | struct scan_control *sc) | ||
1836 | { | ||
1837 | unsigned long pages_for_compaction; | ||
1838 | unsigned long inactive_lru_pages; | ||
1839 | |||
1840 | /* If not in reclaim/compaction mode, stop */ | ||
1841 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | ||
1842 | return false; | ||
1843 | |||
1844 | /* | ||
1845 | * If we failed to reclaim and have scanned the full list, stop. | ||
1846 | * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far | ||
1847 | * faster but obviously would be less likely to succeed | ||
1848 | * allocation. If this is desirable, use GFP_REPEAT to decide | ||
1849 | * if both reclaimed and scanned should be checked or just | ||
1850 | * reclaimed | ||
1851 | */ | ||
1852 | if (!nr_reclaimed && !nr_scanned) | ||
1853 | return false; | ||
1854 | |||
1855 | /* | ||
1856 | * If we have not reclaimed enough pages for compaction and the | ||
1857 | * inactive lists are large enough, continue reclaiming | ||
1858 | */ | ||
1859 | pages_for_compaction = (2UL << sc->order); | ||
1860 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | ||
1861 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1862 | if (sc->nr_reclaimed < pages_for_compaction && | ||
1863 | inactive_lru_pages > pages_for_compaction) | ||
1864 | return true; | ||
1865 | |||
1866 | /* If compaction would go ahead or the allocation would succeed, stop */ | ||
1867 | switch (compaction_suitable(zone, sc->order)) { | ||
1868 | case COMPACT_PARTIAL: | ||
1869 | case COMPACT_CONTINUE: | ||
1870 | return false; | ||
1871 | default: | ||
1872 | return true; | ||
1873 | } | ||
1874 | } | ||
1875 | |||
1876 | /* | ||
1801 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1877 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1802 | */ | 1878 | */ |
1803 | static void shrink_zone(int priority, struct zone *zone, | 1879 | static void shrink_zone(int priority, struct zone *zone, |
@@ -1806,9 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1806 | unsigned long nr[NR_LRU_LISTS]; | 1882 | unsigned long nr[NR_LRU_LISTS]; |
1807 | unsigned long nr_to_scan; | 1883 | unsigned long nr_to_scan; |
1808 | enum lru_list l; | 1884 | enum lru_list l; |
1809 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1885 | unsigned long nr_reclaimed, nr_scanned; |
1810 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1886 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1811 | 1887 | ||
1888 | restart: | ||
1889 | nr_reclaimed = 0; | ||
1890 | nr_scanned = sc->nr_scanned; | ||
1812 | get_scan_count(zone, sc, nr, priority); | 1891 | get_scan_count(zone, sc, nr, priority); |
1813 | 1892 | ||
1814 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1893 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
@@ -1834,8 +1913,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1834 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 1913 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1835 | break; | 1914 | break; |
1836 | } | 1915 | } |
1837 | 1916 | sc->nr_reclaimed += nr_reclaimed; | |
1838 | sc->nr_reclaimed = nr_reclaimed; | ||
1839 | 1917 | ||
1840 | /* | 1918 | /* |
1841 | * Even if we did not try to evict anon pages at all, we want to | 1919 | * Even if we did not try to evict anon pages at all, we want to |
@@ -1844,6 +1922,11 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1844 | if (inactive_anon_is_low(zone, sc)) | 1922 | if (inactive_anon_is_low(zone, sc)) |
1845 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1923 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1846 | 1924 | ||
1925 | /* reclaim/compaction might need reclaim to continue */ | ||
1926 | if (should_continue_reclaim(zone, nr_reclaimed, | ||
1927 | sc->nr_scanned - nr_scanned, sc)) | ||
1928 | goto restart; | ||
1929 | |||
1847 | throttle_vm_writeout(sc->gfp_mask); | 1930 | throttle_vm_writeout(sc->gfp_mask); |
1848 | } | 1931 | } |
1849 | 1932 | ||
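In the hunk above, should_continue_reclaim() keeps shrink_zone() looping (via the restart label) until roughly 2 << order pages have been reclaimed, so that a later compaction pass is likely to have enough free order-0 pages to work with. The snippet below simply tabulates that threshold for a few orders; nothing beyond the 2UL << sc->order expression is taken from the patch.

#include <stdio.h>

int main(void)
{
	int order;

	for (order = 1; order <= 9; order++) {
		unsigned long pages_for_compaction = 2UL << order;

		printf("order %d: reclaim at least %lu order-0 pages first\n",
		       order, pages_for_compaction);
	}
	/* e.g. order 9 (one 2MB huge page with 4KB pages) prints 1024 */
	return 0;
}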
@@ -2000,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2000 | struct zone *preferred_zone; | 2083 | struct zone *preferred_zone; |
2001 | 2084 | ||
2002 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | 2085 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), |
2003 | NULL, &preferred_zone); | 2086 | &cpuset_current_mems_allowed, |
2087 | &preferred_zone); | ||
2004 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | 2088 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); |
2005 | } | 2089 | } |
2006 | } | 2090 | } |
@@ -2117,38 +2201,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2117 | } | 2201 | } |
2118 | #endif | 2202 | #endif |
2119 | 2203 | ||
2204 | /* | ||
2205 | * pgdat_balanced is used when checking if a node is balanced for high-order | ||
2206 | * allocations. Only zones that meet watermarks and are in a zone allowed | ||
2207 | * by the callers classzone_idx are added to balanced_pages. The total of | ||
2208 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | ||
2209 | * for the node to be considered balanced. Forcing all zones to be balanced | ||
2210 | * for high orders can cause excessive reclaim when there are imbalanced zones. | ||
2211 | * The choice of 25% is due to | ||
2212 | * o a 16M DMA zone that is balanced will not balance a zone on any | ||
2213 | * reasonably sized machine | ||
2214 | * o On all other machines, the top zone must be at least a reasonable | ||
2215 | * percentage of the middle zones. For example, on 32-bit x86, highmem | ||
2216 | * would need to be at least 256M for it to balance a whole node. | ||
2217 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | ||
2218 | * to balance a node on its own. These seemed like reasonable ratios. | ||
2219 | */ | ||
2220 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | ||
2221 | int classzone_idx) | ||
2222 | { | ||
2223 | unsigned long present_pages = 0; | ||
2224 | int i; | ||
2225 | |||
2226 | for (i = 0; i <= classzone_idx; i++) | ||
2227 | present_pages += pgdat->node_zones[i].present_pages; | ||
2228 | |||
2229 | return balanced_pages > (present_pages >> 2); | ||
2230 | } | ||
2231 | |||
2120 | /* is kswapd sleeping prematurely? */ | 2232 | /* is kswapd sleeping prematurely? */ |
2121 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | 2233 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, |
2234 | int classzone_idx) | ||
2122 | { | 2235 | { |
2123 | int i; | 2236 | int i; |
2237 | unsigned long balanced = 0; | ||
2238 | bool all_zones_ok = true; | ||
2124 | 2239 | ||
2125 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2240 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2126 | if (remaining) | 2241 | if (remaining) |
2127 | return 1; | 2242 | return true; |
2128 | 2243 | ||
2129 | /* If after HZ/10, a zone is below the high mark, it's premature */ | 2244 | /* Check the watermark levels */ |
2130 | for (i = 0; i < pgdat->nr_zones; i++) { | 2245 | for (i = 0; i < pgdat->nr_zones; i++) { |
2131 | struct zone *zone = pgdat->node_zones + i; | 2246 | struct zone *zone = pgdat->node_zones + i; |
2132 | 2247 | ||
2133 | if (!populated_zone(zone)) | 2248 | if (!populated_zone(zone)) |
2134 | continue; | 2249 | continue; |
2135 | 2250 | ||
2136 | if (zone->all_unreclaimable) | 2251 | /* |
2252 | * balance_pgdat() skips over all_unreclaimable after | ||
2253 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
2254 | * they must be considered balanced here as well if kswapd | ||
2255 | * is to sleep | ||
2256 | */ | ||
2257 | if (zone->all_unreclaimable) { | ||
2258 | balanced += zone->present_pages; | ||
2137 | continue; | 2259 | continue; |
2260 | } | ||
2138 | 2261 | ||
2139 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2262 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2140 | 0, 0)) | 2263 | classzone_idx, 0)) |
2141 | return 1; | 2264 | all_zones_ok = false; |
2265 | else | ||
2266 | balanced += zone->present_pages; | ||
2142 | } | 2267 | } |
2143 | 2268 | ||
2144 | return 0; | 2269 | /* |
2270 | * For high-order requests, the balanced zones must contain at least | ||
2271 | * 25% of the node's pages for kswapd to sleep. For order-0, all zones | ||
2272 | * must be balanced | ||
2273 | */ | ||
2274 | if (order) | ||
2275 | return pgdat_balanced(pgdat, balanced, classzone_idx); | ||
2276 | else | ||
2277 | return !all_zones_ok; | ||
2145 | } | 2278 | } |
2146 | 2279 | ||
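pgdat_balanced() above declares a node balanced for a high-order wakeup when the zones meeting their watermarks cover more than a quarter of the pages in zones up to classzone_idx. The sketch below reproduces that arithmetic with invented zone sizes (a rough 32-bit x86 DMA/Normal/HighMem split); it is an illustration, not the kernel function.

#include <stdbool.h>
#include <stdio.h>

static bool pgdat_balanced(const unsigned long *present, int classzone_idx,
			   unsigned long balanced_pages)
{
	unsigned long present_pages = 0;
	int i;

	for (i = 0; i <= classzone_idx; i++)
		present_pages += present[i];

	/* Balanced zones must cover more than a quarter of the pages */
	return balanced_pages > (present_pages >> 2);
}

int main(void)
{
	/* Invented sizes in pages: ~16MB DMA, ~880MB Normal, ~128MB HighMem */
	unsigned long present[] = { 4096, 225280, 32768 };

	/* Only the DMA zone balanced: 4096 of 262144 pages, kswapd keeps going */
	printf("%d\n", pgdat_balanced(present, 2, 4096));

	/* DMA and Normal balanced: well over 25%, kswapd may sleep */
	printf("%d\n", pgdat_balanced(present, 2, 4096 + 225280));
	return 0;
}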
2147 | /* | 2280 | /* |
2148 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2281 | * For kswapd, balance_pgdat() will work across all this node's zones until |
2149 | * they are all at high_wmark_pages(zone). | 2282 | * they are all at high_wmark_pages(zone). |
2150 | * | 2283 | * |
2151 | * Returns the number of pages which were actually freed. | 2284 | * Returns the final order kswapd was reclaiming at |
2152 | * | 2285 | * |
2153 | * There is special handling here for zones which are full of pinned pages. | 2286 | * There is special handling here for zones which are full of pinned pages. |
2154 | * This can happen if the pages are all mlocked, or if they are all used by | 2287 | * This can happen if the pages are all mlocked, or if they are all used by |
@@ -2165,11 +2298,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2165 | * interoperates with the page allocator fallback scheme to ensure that aging | 2298 | * interoperates with the page allocator fallback scheme to ensure that aging |
2166 | * of pages is balanced across the zones. | 2299 | * of pages is balanced across the zones. |
2167 | */ | 2300 | */ |
2168 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 2301 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2302 | int *classzone_idx) | ||
2169 | { | 2303 | { |
2170 | int all_zones_ok; | 2304 | int all_zones_ok; |
2305 | unsigned long balanced; | ||
2171 | int priority; | 2306 | int priority; |
2172 | int i; | 2307 | int i; |
2308 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2173 | unsigned long total_scanned; | 2309 | unsigned long total_scanned; |
2174 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2310 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2175 | struct scan_control sc = { | 2311 | struct scan_control sc = { |
@@ -2192,7 +2328,6 @@ loop_again: | |||
2192 | count_vm_event(PAGEOUTRUN); | 2328 | count_vm_event(PAGEOUTRUN); |
2193 | 2329 | ||
2194 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2330 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2195 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2196 | unsigned long lru_pages = 0; | 2331 | unsigned long lru_pages = 0; |
2197 | int has_under_min_watermark_zone = 0; | 2332 | int has_under_min_watermark_zone = 0; |
2198 | 2333 | ||
@@ -2201,6 +2336,7 @@ loop_again: | |||
2201 | disable_swap_token(); | 2336 | disable_swap_token(); |
2202 | 2337 | ||
2203 | all_zones_ok = 1; | 2338 | all_zones_ok = 1; |
2339 | balanced = 0; | ||
2204 | 2340 | ||
2205 | /* | 2341 | /* |
2206 | * Scan in the highmem->dma direction for the highest | 2342 | * Scan in the highmem->dma direction for the highest |
@@ -2223,9 +2359,10 @@ loop_again: | |||
2223 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2359 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2224 | &sc, priority, 0); | 2360 | &sc, priority, 0); |
2225 | 2361 | ||
2226 | if (!zone_watermark_ok(zone, order, | 2362 | if (!zone_watermark_ok_safe(zone, order, |
2227 | high_wmark_pages(zone), 0, 0)) { | 2363 | high_wmark_pages(zone), 0, 0)) { |
2228 | end_zone = i; | 2364 | end_zone = i; |
2365 | *classzone_idx = i; | ||
2229 | break; | 2366 | break; |
2230 | } | 2367 | } |
2231 | } | 2368 | } |
@@ -2248,6 +2385,7 @@ loop_again: | |||
2248 | * cause too much scanning of the lower zones. | 2385 | * cause too much scanning of the lower zones. |
2249 | */ | 2386 | */ |
2250 | for (i = 0; i <= end_zone; i++) { | 2387 | for (i = 0; i <= end_zone; i++) { |
2388 | int compaction; | ||
2251 | struct zone *zone = pgdat->node_zones + i; | 2389 | struct zone *zone = pgdat->node_zones + i; |
2252 | int nr_slab; | 2390 | int nr_slab; |
2253 | 2391 | ||
@@ -2269,7 +2407,7 @@ loop_again: | |||
2269 | * We put equal pressure on every zone, unless one | 2407 | * We put equal pressure on every zone, unless one |
2270 | * zone has way too many pages free already. | 2408 | * zone has way too many pages free already. |
2271 | */ | 2409 | */ |
2272 | if (!zone_watermark_ok(zone, order, | 2410 | if (!zone_watermark_ok_safe(zone, order, |
2273 | 8*high_wmark_pages(zone), end_zone, 0)) | 2411 | 8*high_wmark_pages(zone), end_zone, 0)) |
2274 | shrink_zone(priority, zone, &sc); | 2412 | shrink_zone(priority, zone, &sc); |
2275 | reclaim_state->reclaimed_slab = 0; | 2413 | reclaim_state->reclaimed_slab = 0; |
@@ -2277,9 +2415,26 @@ loop_again: | |||
2277 | lru_pages); | 2415 | lru_pages); |
2278 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2416 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2279 | total_scanned += sc.nr_scanned; | 2417 | total_scanned += sc.nr_scanned; |
2418 | |||
2419 | compaction = 0; | ||
2420 | if (order && | ||
2421 | zone_watermark_ok(zone, 0, | ||
2422 | high_wmark_pages(zone), | ||
2423 | end_zone, 0) && | ||
2424 | !zone_watermark_ok(zone, order, | ||
2425 | high_wmark_pages(zone), | ||
2426 | end_zone, 0)) { | ||
2427 | compact_zone_order(zone, | ||
2428 | order, | ||
2429 | sc.gfp_mask, false, | ||
2430 | COMPACT_MODE_KSWAPD); | ||
2431 | compaction = 1; | ||
2432 | } | ||
2433 | |||
2280 | if (zone->all_unreclaimable) | 2434 | if (zone->all_unreclaimable) |
2281 | continue; | 2435 | continue; |
2282 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2436 | if (!compaction && nr_slab == 0 && |
2437 | !zone_reclaimable(zone)) | ||
2283 | zone->all_unreclaimable = 1; | 2438 | zone->all_unreclaimable = 1; |
2284 | /* | 2439 | /* |
2285 | * If we've done a decent amount of scanning and | 2440 | * If we've done a decent amount of scanning and |
@@ -2290,7 +2445,7 @@ loop_again: | |||
2290 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2445 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2291 | sc.may_writepage = 1; | 2446 | sc.may_writepage = 1; |
2292 | 2447 | ||
2293 | if (!zone_watermark_ok(zone, order, | 2448 | if (!zone_watermark_ok_safe(zone, order, |
2294 | high_wmark_pages(zone), end_zone, 0)) { | 2449 | high_wmark_pages(zone), end_zone, 0)) { |
2295 | all_zones_ok = 0; | 2450 | all_zones_ok = 0; |
2296 | /* | 2451 | /* |
@@ -2298,7 +2453,7 @@ loop_again: | |||
2298 | * means that we have a GFP_ATOMIC allocation | 2453 | * means that we have a GFP_ATOMIC allocation |
2299 | * failure risk. Hurry up! | 2454 | * failure risk. Hurry up! |
2300 | */ | 2455 | */ |
2301 | if (!zone_watermark_ok(zone, order, | 2456 | if (!zone_watermark_ok_safe(zone, order, |
2302 | min_wmark_pages(zone), end_zone, 0)) | 2457 | min_wmark_pages(zone), end_zone, 0)) |
2303 | has_under_min_watermark_zone = 1; | 2458 | has_under_min_watermark_zone = 1; |
2304 | } else { | 2459 | } else { |
@@ -2310,10 +2465,12 @@ loop_again: | |||
2310 | * spectulatively avoid congestion waits | 2465 | * spectulatively avoid congestion waits |
2311 | */ | 2466 | */ |
2312 | zone_clear_flag(zone, ZONE_CONGESTED); | 2467 | zone_clear_flag(zone, ZONE_CONGESTED); |
2468 | if (i <= *classzone_idx) | ||
2469 | balanced += zone->present_pages; | ||
2313 | } | 2470 | } |
2314 | 2471 | ||
2315 | } | 2472 | } |
2316 | if (all_zones_ok) | 2473 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2317 | break; /* kswapd: all done */ | 2474 | break; /* kswapd: all done */ |
2318 | /* | 2475 | /* |
2319 | * OK, kswapd is getting into trouble. Take a nap, then take | 2476 | * OK, kswapd is getting into trouble. Take a nap, then take |
@@ -2336,7 +2493,13 @@ loop_again: | |||
2336 | break; | 2493 | break; |
2337 | } | 2494 | } |
2338 | out: | 2495 | out: |
2339 | if (!all_zones_ok) { | 2496 | |
2497 | /* | ||
2498 | * order-0: All zones must meet high watermark for a balanced node | ||
2499 | * high-order: Balanced zones must make up at least 25% of the node | ||
2500 | * for the node to be balanced | ||
2501 | */ | ||
2502 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | ||
2340 | cond_resched(); | 2503 | cond_resched(); |
2341 | 2504 | ||
2342 | try_to_freeze(); | 2505 | try_to_freeze(); |
@@ -2361,7 +2524,88 @@ out: | |||
2361 | goto loop_again; | 2524 | goto loop_again; |
2362 | } | 2525 | } |
2363 | 2526 | ||
2364 | return sc.nr_reclaimed; | 2527 | /* |
2528 | * If kswapd was reclaiming at a higher order, it has the option of | ||
2529 | * sleeping without all zones being balanced. Before it does, it must | ||
2530 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
2531 | * that the congestion flags are cleared. The congestion flag must | ||
2532 | * be cleared as kswapd is the only mechanism that clears the flag | ||
2533 | * and it is potentially going to sleep here. | ||
2534 | */ | ||
2535 | if (order) { | ||
2536 | for (i = 0; i <= end_zone; i++) { | ||
2537 | struct zone *zone = pgdat->node_zones + i; | ||
2538 | |||
2539 | if (!populated_zone(zone)) | ||
2540 | continue; | ||
2541 | |||
2542 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | ||
2543 | continue; | ||
2544 | |||
2545 | /* Confirm the zone is balanced for order-0 */ | ||
2546 | if (!zone_watermark_ok(zone, 0, | ||
2547 | high_wmark_pages(zone), 0, 0)) { | ||
2548 | order = sc.order = 0; | ||
2549 | goto loop_again; | ||
2550 | } | ||
2551 | |||
2552 | /* If balanced, clear the congested flag */ | ||
2553 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2554 | } | ||
2555 | } | ||
2556 | |||
2557 | /* | ||
2558 | * Return the order we were reclaiming at so sleeping_prematurely() | ||
2559 | * makes a decision on the order we were last reclaiming at. However, | ||
2560 | * if another caller entered the allocator slow path while kswapd | ||
2561 | * was awake, order will remain at the higher level | ||
2562 | */ | ||
2563 | *classzone_idx = end_zone; | ||
2564 | return order; | ||
2565 | } | ||
2566 | |||
2567 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | ||
2568 | { | ||
2569 | long remaining = 0; | ||
2570 | DEFINE_WAIT(wait); | ||
2571 | |||
2572 | if (freezing(current) || kthread_should_stop()) | ||
2573 | return; | ||
2574 | |||
2575 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2576 | |||
2577 | /* Try to sleep for a short interval */ | ||
2578 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2579 | remaining = schedule_timeout(HZ/10); | ||
2580 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2581 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2582 | } | ||
2583 | |||
2584 | /* | ||
2585 | * After a short sleep, check if it was a premature sleep. If not, then | ||
2586 | * go fully to sleep until explicitly woken up. | ||
2587 | */ | ||
2588 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2589 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2590 | |||
2591 | /* | ||
2592 | * vmstat counters are not perfectly accurate and the estimated | ||
2593 | * value for counters such as NR_FREE_PAGES can deviate from the | ||
2594 | * true value by nr_online_cpus * threshold. To avoid the zone | ||
2595 | * watermarks being breached while under pressure, we reduce the | ||
2596 | * per-cpu vmstat threshold while kswapd is awake and restore | ||
2597 | * them before going back to sleep. | ||
2598 | */ | ||
2599 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | ||
2600 | schedule(); | ||
2601 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | ||
2602 | } else { | ||
2603 | if (remaining) | ||
2604 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2605 | else | ||
2606 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2607 | } | ||
2608 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2365 | } | 2609 | } |
2366 | 2610 | ||
2367 | /* | 2611 | /* |
@@ -2380,9 +2624,10 @@ out: | |||
2380 | static int kswapd(void *p) | 2624 | static int kswapd(void *p) |
2381 | { | 2625 | { |
2382 | unsigned long order; | 2626 | unsigned long order; |
2627 | int classzone_idx; | ||
2383 | pg_data_t *pgdat = (pg_data_t*)p; | 2628 | pg_data_t *pgdat = (pg_data_t*)p; |
2384 | struct task_struct *tsk = current; | 2629 | struct task_struct *tsk = current; |
2385 | DEFINE_WAIT(wait); | 2630 | |
2386 | struct reclaim_state reclaim_state = { | 2631 | struct reclaim_state reclaim_state = { |
2387 | .reclaimed_slab = 0, | 2632 | .reclaimed_slab = 0, |
2388 | }; | 2633 | }; |
@@ -2410,49 +2655,30 @@ static int kswapd(void *p) | |||
2410 | set_freezable(); | 2655 | set_freezable(); |
2411 | 2656 | ||
2412 | order = 0; | 2657 | order = 0; |
2658 | classzone_idx = MAX_NR_ZONES - 1; | ||
2413 | for ( ; ; ) { | 2659 | for ( ; ; ) { |
2414 | unsigned long new_order; | 2660 | unsigned long new_order; |
2661 | int new_classzone_idx; | ||
2415 | int ret; | 2662 | int ret; |
2416 | 2663 | ||
2417 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2418 | new_order = pgdat->kswapd_max_order; | 2664 | new_order = pgdat->kswapd_max_order; |
2665 | new_classzone_idx = pgdat->classzone_idx; | ||
2419 | pgdat->kswapd_max_order = 0; | 2666 | pgdat->kswapd_max_order = 0; |
2420 | if (order < new_order) { | 2667 | pgdat->classzone_idx = MAX_NR_ZONES - 1; |
2668 | if (order < new_order || classzone_idx > new_classzone_idx) { | ||
2421 | /* | 2669 | /* |
2422 | * Don't sleep if someone wants a larger 'order' | 2670 | * Don't sleep if someone wants a larger 'order' |
2423 | * allocation | 2671 | * allocation or has tighter zone constraints |
2424 | */ | 2672 | */ |
2425 | order = new_order; | 2673 | order = new_order; |
2674 | classzone_idx = new_classzone_idx; | ||
2426 | } else { | 2675 | } else { |
2427 | if (!freezing(current) && !kthread_should_stop()) { | 2676 | kswapd_try_to_sleep(pgdat, order, classzone_idx); |
2428 | long remaining = 0; | ||
2429 | |||
2430 | /* Try to sleep for a short interval */ | ||
2431 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2432 | remaining = schedule_timeout(HZ/10); | ||
2433 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2434 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2435 | } | ||
2436 | |||
2437 | /* | ||
2438 | * After a short sleep, check if it was a | ||
2439 | * premature sleep. If not, then go fully | ||
2440 | * to sleep until explicitly woken up | ||
2441 | */ | ||
2442 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2443 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2444 | schedule(); | ||
2445 | } else { | ||
2446 | if (remaining) | ||
2447 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2448 | else | ||
2449 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2450 | } | ||
2451 | } | ||
2452 | |||
2453 | order = pgdat->kswapd_max_order; | 2677 | order = pgdat->kswapd_max_order; |
2678 | classzone_idx = pgdat->classzone_idx; | ||
2679 | pgdat->kswapd_max_order = 0; | ||
2680 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | ||
2454 | } | 2681 | } |
2455 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2456 | 2682 | ||
2457 | ret = try_to_freeze(); | 2683 | ret = try_to_freeze(); |
2458 | if (kthread_should_stop()) | 2684 | if (kthread_should_stop()) |
@@ -2464,7 +2690,7 @@ static int kswapd(void *p) | |||
2464 | */ | 2690 | */ |
2465 | if (!ret) { | 2691 | if (!ret) { |
2466 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2692 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2467 | balance_pgdat(pgdat, order); | 2693 | order = balance_pgdat(pgdat, order, &classzone_idx); |
2468 | } | 2694 | } |
2469 | } | 2695 | } |
2470 | return 0; | 2696 | return 0; |
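The kswapd loop above now delegates its nap to kswapd_try_to_sleep(), which sleeps in two phases: a short trial sleep, then a full sleep only if sleeping_prematurely() still says the node looks balanced afterwards. The fragment below is a rough userspace stand-in for that shape; node_still_needs_work(), the nap length and the prints are stand-ins, and the vmstat threshold switching from the patch is not modelled.

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static bool node_still_needs_work(void)
{
	return false;	/* pretend the node stayed balanced */
}

static void try_to_sleep(void)
{
	struct timespec nap = { 0, 100 * 1000 * 1000 };	/* ~HZ/10 stand-in */

	/* Phase 1: short trial nap, only if sleeping does not look premature */
	if (!node_still_needs_work())
		nanosleep(&nap, NULL);

	/* Phase 2: commit to a long sleep only if the check still passes */
	if (!node_still_needs_work())
		printf("would sleep until explicitly woken\n");
	else
		printf("premature sleep detected, keep reclaiming\n");
}

int main(void)
{
	try_to_sleep();
	return 0;
}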
@@ -2473,23 +2699,26 @@ static int kswapd(void *p) | |||
2473 | /* | 2699 | /* |
2474 | * A zone is low on free memory, so wake its kswapd task to service it. | 2700 | * A zone is low on free memory, so wake its kswapd task to service it. |
2475 | */ | 2701 | */ |
2476 | void wakeup_kswapd(struct zone *zone, int order) | 2702 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
2477 | { | 2703 | { |
2478 | pg_data_t *pgdat; | 2704 | pg_data_t *pgdat; |
2479 | 2705 | ||
2480 | if (!populated_zone(zone)) | 2706 | if (!populated_zone(zone)) |
2481 | return; | 2707 | return; |
2482 | 2708 | ||
2483 | pgdat = zone->zone_pgdat; | ||
2484 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2485 | return; | ||
2486 | if (pgdat->kswapd_max_order < order) | ||
2487 | pgdat->kswapd_max_order = order; | ||
2488 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2489 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2709 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2490 | return; | 2710 | return; |
2711 | pgdat = zone->zone_pgdat; | ||
2712 | if (pgdat->kswapd_max_order < order) { | ||
2713 | pgdat->kswapd_max_order = order; | ||
2714 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | ||
2715 | } | ||
2491 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2716 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2492 | return; | 2717 | return; |
2718 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2719 | return; | ||
2720 | |||
2721 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2493 | wake_up_interruptible(&pgdat->kswapd_wait); | 2722 | wake_up_interruptible(&pgdat->kswapd_wait); |
2494 | } | 2723 | } |
2495 | 2724 | ||
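wakeup_kswapd() now records both an order and a classzone_idx, folding concurrent wakeup requests into one pending pair per node: when a caller asks for a larger order, the pending order is raised and the classzone_idx is clamped down to the more restrictive of the two. The fragment below models just that folding in plain C; struct kswapd_request, MAX_NR_ZONES here and the example values are invented for illustration.

#include <stdio.h>

#define MAX_NR_ZONES 4	/* assumed zone count for the example */

struct kswapd_request {
	int kswapd_max_order;
	int classzone_idx;
};

static void fold_request(struct kswapd_request *req, int order, int classzone_idx)
{
	/* Mirror the update in wakeup_kswapd(): keep the hardest request */
	if (req->kswapd_max_order < order) {
		req->kswapd_max_order = order;
		req->classzone_idx = req->classzone_idx < classzone_idx ?
					req->classzone_idx : classzone_idx;
	}
}

int main(void)
{
	struct kswapd_request req = { 0, MAX_NR_ZONES - 1 };

	fold_request(&req, 3, 2);	/* order-3 request limited to zone 2 */
	fold_request(&req, 9, 1);	/* harder: larger order, lower zone index */
	printf("order=%d classzone_idx=%d\n",
	       req.kswapd_max_order, req.classzone_idx);
	return 0;
}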