Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 432 |
1 file changed, 328 insertions, 104 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..99999a9b2b0b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
35 | #include <linux/compaction.h> | ||
35 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
36 | #include <linux/rwsem.h> | 37 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
@@ -40,6 +41,7 @@ | |||
40 | #include <linux/memcontrol.h> | 41 | #include <linux/memcontrol.h> |
41 | #include <linux/delayacct.h> | 42 | #include <linux/delayacct.h> |
42 | #include <linux/sysctl.h> | 43 | #include <linux/sysctl.h> |
44 | #include <linux/compaction.h> | ||
43 | 45 | ||
44 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
45 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
@@ -51,11 +53,23 @@ | |||
51 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
52 | #include <trace/events/vmscan.h> | 54 | #include <trace/events/vmscan.h> |
53 | 55 | ||
54 | enum lumpy_mode { | 56 | /* |
55 | LUMPY_MODE_NONE, | 57 | * reclaim_mode determines how the inactive list is shrunk |
56 | LUMPY_MODE_ASYNC, | 58 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages |
57 | LUMPY_MODE_SYNC, | 59 | * RECLAIM_MODE_ASYNC: Do not block |
58 | }; | 60 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback |
61 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
62 | * page from the LRU and reclaim all pages within a | ||
63 | * naturally aligned range | ||
64 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
65 | * order-0 pages and then compact the zone | ||
66 | */ | ||
67 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
68 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
69 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
70 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
71 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
72 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
59 | 73 | ||
60 | struct scan_control { | 74 | struct scan_control { |
61 | /* Incremented by the number of inactive pages that were scanned */ | 75 | /* Incremented by the number of inactive pages that were scanned */ |
@@ -88,7 +102,7 @@ struct scan_control { | |||
88 | * Intend to reclaim enough continuous memory rather than reclaim | 102 | * Intend to reclaim enough continuous memory rather than reclaim |
89 | * enough amount of memory. i.e, mode for high order allocation. | 103 | * enough amount of memory. i.e, mode for high order allocation. |
90 | */ | 104 | */ |
91 | enum lumpy_mode lumpy_reclaim_mode; | 105 | reclaim_mode_t reclaim_mode; |
92 | 106 | ||
93 | /* Which cgroup do we reclaim from */ | 107 | /* Which cgroup do we reclaim from */ |
94 | struct mem_cgroup *mem_cgroup; | 108 | struct mem_cgroup *mem_cgroup; |
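Note on the new flags: reclaim_mode_t is a bitmask rather than the old three-value enum, so a single scan_control now carries a "what to reclaim" bit (SINGLE, LUMPYRECLAIM or COMPACTION) together with a "may it block" bit (ASYNC or SYNC). A minimal standalone sketch of how the bits are meant to combine and be tested -- illustrative only, with the sparse __bitwise__/__force annotations dropped and plain printf standing in for the kernel paths:

#include <stdio.h>

typedef unsigned int reclaim_mode_t;
#define RECLAIM_MODE_SINGLE		0x01u
#define RECLAIM_MODE_ASYNC		0x02u
#define RECLAIM_MODE_SYNC		0x04u
#define RECLAIM_MODE_LUMPYRECLAIM	0x08u
#define RECLAIM_MODE_COMPACTION		0x10u

int main(void)
{
	/* What set_reclaim_mode() builds for a costly allocation on a
	 * compaction-enabled kernel when synchronous reclaim is requested. */
	reclaim_mode_t mode = RECLAIM_MODE_COMPACTION | RECLAIM_MODE_SYNC;

	if (mode & RECLAIM_MODE_SYNC)		/* blocking allowed */
		printf("may call wait_on_page_writeback()\n");
	if (mode & RECLAIM_MODE_COMPACTION)	/* reclaim order-0, then compact */
		printf("should_continue_reclaim() may restart shrink_zone()\n");
	if (!(mode & RECLAIM_MODE_LUMPYRECLAIM))
		printf("page references are still honoured\n");
	return 0;
}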
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
271 | return ret; | 285 | return ret; |
272 | } | 286 | } |
273 | 287 | ||
274 | static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, | 288 | static void set_reclaim_mode(int priority, struct scan_control *sc, |
275 | bool sync) | 289 | bool sync) |
276 | { | 290 | { |
277 | enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; | 291 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; |
278 | 292 | ||
279 | /* | 293 | /* |
280 | * Some reclaim have alredy been failed. No worth to try synchronous | 294 | * Initially assume we are entering either lumpy reclaim or |
281 | * lumpy reclaim. | 295 | * reclaim/compaction. Depending on the order, we will either set the |
296 | * sync mode or just reclaim order-0 pages later. | ||
282 | */ | 297 | */ |
283 | if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 298 | if (COMPACTION_BUILD) |
284 | return; | 299 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; |
300 | else | ||
301 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
285 | 302 | ||
286 | /* | 303 | /* |
287 | * If we need a large contiguous chunk of memory, or have | 304 | * Avoid using lumpy reclaim or reclaim/compaction if possible by |
288 | * trouble getting a small set of contiguous pages, we | 305 | * restricting when it is set to either costly allocations or when |
289 | * will reclaim both active and inactive pages. | 306 | * under memory pressure |
290 | */ | 307 | */ |
291 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 308 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) |
292 | sc->lumpy_reclaim_mode = mode; | 309 | sc->reclaim_mode |= syncmode; |
293 | else if (sc->order && priority < DEF_PRIORITY - 2) | 310 | else if (sc->order && priority < DEF_PRIORITY - 2) |
294 | sc->lumpy_reclaim_mode = mode; | 311 | sc->reclaim_mode |= syncmode; |
295 | else | 312 | else |
296 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 313 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
297 | } | 314 | } |
298 | 315 | ||
299 | static void disable_lumpy_reclaim_mode(struct scan_control *sc) | 316 | static void reset_reclaim_mode(struct scan_control *sc) |
300 | { | 317 | { |
301 | sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; | 318 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; |
302 | } | 319 | } |
303 | 320 | ||
304 | static inline int is_page_cache_freeable(struct page *page) | 321 | static inline int is_page_cache_freeable(struct page *page) |
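In short, set_reclaim_mode() above only keeps the high-order mode (lumpy reclaim, or reclaim/compaction when COMPACTION_BUILD) if the allocation is costly or reclaim is already struggling; everything else collapses back to plain order-0 reclaim. Assuming the usual values PAGE_ALLOC_COSTLY_ORDER = 3 and DEF_PRIORITY = 12, the decision works out to:

    order > 3                        ->  LUMPYRECLAIM/COMPACTION bit | sync-or-async bit
    order > 0 and priority < 10      ->  LUMPYRECLAIM/COMPACTION bit | sync-or-async bit
    otherwise                        ->  RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC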
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
429 | * first attempt to free a range of pages fails. | 446 | * first attempt to free a range of pages fails. |
430 | */ | 447 | */ |
431 | if (PageWriteback(page) && | 448 | if (PageWriteback(page) && |
432 | sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) | 449 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) |
433 | wait_on_page_writeback(page); | 450 | wait_on_page_writeback(page); |
434 | 451 | ||
435 | if (!PageWriteback(page)) { | 452 | if (!PageWriteback(page)) { |
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
437 | ClearPageReclaim(page); | 454 | ClearPageReclaim(page); |
438 | } | 455 | } |
439 | trace_mm_vmscan_writepage(page, | 456 | trace_mm_vmscan_writepage(page, |
440 | trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); | 457 | trace_reclaim_flags(page, sc->reclaim_mode)); |
441 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 458 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
442 | return PAGE_SUCCESS; | 459 | return PAGE_SUCCESS; |
443 | } | 460 | } |
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page, | |||
622 | referenced_page = TestClearPageReferenced(page); | 639 | referenced_page = TestClearPageReferenced(page); |
623 | 640 | ||
624 | /* Lumpy reclaim - ignore references */ | 641 | /* Lumpy reclaim - ignore references */ |
625 | if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) | 642 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
626 | return PAGEREF_RECLAIM; | 643 | return PAGEREF_RECLAIM; |
627 | 644 | ||
628 | /* | 645 | /* |
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
739 | * for any page for which writeback has already | 756 | * for any page for which writeback has already |
740 | * started. | 757 | * started. |
741 | */ | 758 | */ |
742 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && | 759 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
743 | may_enter_fs) | 760 | may_enter_fs) |
744 | wait_on_page_writeback(page); | 761 | wait_on_page_writeback(page); |
745 | else { | 762 | else { |
@@ -895,7 +912,7 @@ cull_mlocked: | |||
895 | try_to_free_swap(page); | 912 | try_to_free_swap(page); |
896 | unlock_page(page); | 913 | unlock_page(page); |
897 | putback_lru_page(page); | 914 | putback_lru_page(page); |
898 | disable_lumpy_reclaim_mode(sc); | 915 | reset_reclaim_mode(sc); |
899 | continue; | 916 | continue; |
900 | 917 | ||
901 | activate_locked: | 918 | activate_locked: |
@@ -908,7 +925,7 @@ activate_locked: | |||
908 | keep_locked: | 925 | keep_locked: |
909 | unlock_page(page); | 926 | unlock_page(page); |
910 | keep: | 927 | keep: |
911 | disable_lumpy_reclaim_mode(sc); | 928 | reset_reclaim_mode(sc); |
912 | keep_lumpy: | 929 | keep_lumpy: |
913 | list_add(&page->lru, &ret_pages); | 930 | list_add(&page->lru, &ret_pages); |
914 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 931 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1028 | case 0: | 1045 | case 0: |
1029 | list_move(&page->lru, dst); | 1046 | list_move(&page->lru, dst); |
1030 | mem_cgroup_del_lru(page); | 1047 | mem_cgroup_del_lru(page); |
1031 | nr_taken++; | 1048 | nr_taken += hpage_nr_pages(page); |
1032 | break; | 1049 | break; |
1033 | 1050 | ||
1034 | case -EBUSY: | 1051 | case -EBUSY: |
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1086 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1103 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1087 | list_move(&cursor_page->lru, dst); | 1104 | list_move(&cursor_page->lru, dst); |
1088 | mem_cgroup_del_lru(cursor_page); | 1105 | mem_cgroup_del_lru(cursor_page); |
1089 | nr_taken++; | 1106 | nr_taken += hpage_nr_pages(page); |
1090 | nr_lumpy_taken++; | 1107 | nr_lumpy_taken++; |
1091 | if (PageDirty(cursor_page)) | 1108 | if (PageDirty(cursor_page)) |
1092 | nr_lumpy_dirty++; | 1109 | nr_lumpy_dirty++; |
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
1141 | struct page *page; | 1158 | struct page *page; |
1142 | 1159 | ||
1143 | list_for_each_entry(page, page_list, lru) { | 1160 | list_for_each_entry(page, page_list, lru) { |
1161 | int numpages = hpage_nr_pages(page); | ||
1144 | lru = page_lru_base_type(page); | 1162 | lru = page_lru_base_type(page); |
1145 | if (PageActive(page)) { | 1163 | if (PageActive(page)) { |
1146 | lru += LRU_ACTIVE; | 1164 | lru += LRU_ACTIVE; |
1147 | ClearPageActive(page); | 1165 | ClearPageActive(page); |
1148 | nr_active++; | 1166 | nr_active += numpages; |
1149 | } | 1167 | } |
1150 | if (count) | 1168 | if (count) |
1151 | count[lru]++; | 1169 | count[lru] += numpages; |
1152 | } | 1170 | } |
1153 | 1171 | ||
1154 | return nr_active; | 1172 | return nr_active; |
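The hpage_nr_pages() conversions in the two hunks above (and the matching ones further down in putback_lru_pages(), move_active_pages_to_lru() and shrink_active_list()) keep the LRU bookkeeping in units of base pages now that transparent huge pages can appear on the LRU: hpage_nr_pages() is 1 for a normal page and the number of base pages for a compound huge page, so isolating one 2MB THP on a 4KB-page system adds 2MB / 4KB = 512 to nr_taken rather than 1.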
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1253 | spin_lock_irq(&zone->lru_lock); | 1271 | spin_lock_irq(&zone->lru_lock); |
1254 | continue; | 1272 | continue; |
1255 | } | 1273 | } |
1256 | SetPageLRU(page); | ||
1257 | lru = page_lru(page); | 1274 | lru = page_lru(page); |
1258 | add_page_to_lru_list(zone, page, lru); | ||
1259 | if (is_active_lru(lru)) { | 1275 | if (is_active_lru(lru)) { |
1260 | int file = is_file_lru(lru); | 1276 | int file = is_file_lru(lru); |
1261 | reclaim_stat->recent_rotated[file]++; | 1277 | int numpages = hpage_nr_pages(page); |
1278 | reclaim_stat->recent_rotated[file] += numpages; | ||
1279 | if (putback_active_lru_page(zone, page)) | ||
1280 | continue; | ||
1262 | } | 1281 | } |
1282 | SetPageLRU(page); | ||
1283 | add_page_to_lru_list(zone, page, lru); | ||
1263 | if (!pagevec_add(&pvec, page)) { | 1284 | if (!pagevec_add(&pvec, page)) { |
1264 | spin_unlock_irq(&zone->lru_lock); | 1285 | spin_unlock_irq(&zone->lru_lock); |
1265 | __pagevec_release(&pvec); | 1286 | __pagevec_release(&pvec); |
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1324 | return false; | 1345 | return false; |
1325 | 1346 | ||
1326 | /* Only stall on lumpy reclaim */ | 1347 | /* Only stall on lumpy reclaim */ |
1327 | if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) | 1348 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1328 | return false; | 1349 | return false; |
1329 | 1350 | ||
1330 | /* If we have reclaimed everything on the isolated list, no stall */ | 1351 ||
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1368 | return SWAP_CLUSTER_MAX; | 1389 | return SWAP_CLUSTER_MAX; |
1369 | } | 1390 | } |
1370 | 1391 | ||
1371 | set_lumpy_reclaim_mode(priority, sc, false); | 1392 | set_reclaim_mode(priority, sc, false); |
1372 | lru_add_drain(); | 1393 | lru_add_drain(); |
1373 | spin_lock_irq(&zone->lru_lock); | 1394 | spin_lock_irq(&zone->lru_lock); |
1374 | 1395 | ||
1375 | if (scanning_global_lru(sc)) { | 1396 | if (scanning_global_lru(sc)) { |
1376 | nr_taken = isolate_pages_global(nr_to_scan, | 1397 | nr_taken = isolate_pages_global(nr_to_scan, |
1377 | &page_list, &nr_scanned, sc->order, | 1398 | &page_list, &nr_scanned, sc->order, |
1378 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1399 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1379 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1400 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1380 | zone, 0, file); | 1401 | zone, 0, file); |
1381 | zone->pages_scanned += nr_scanned; | 1402 | zone->pages_scanned += nr_scanned; |
1382 | if (current_is_kswapd()) | 1403 | if (current_is_kswapd()) |
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1388 | } else { | 1409 | } else { |
1389 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1410 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, |
1390 | &page_list, &nr_scanned, sc->order, | 1411 | &page_list, &nr_scanned, sc->order, |
1391 | sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? | 1412 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1392 | ISOLATE_INACTIVE : ISOLATE_BOTH, | 1413 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1393 | zone, sc->mem_cgroup, | 1414 | zone, sc->mem_cgroup, |
1394 | 0, file); | 1415 | 0, file); |
1395 | /* | 1416 | /* |
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1411 | 1432 | ||
1412 | /* Check if we should synchronously wait for writeback */ | 1433 | /* Check if we should synchronously wait for writeback */ |
1413 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1434 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1414 | set_lumpy_reclaim_mode(priority, sc, true); | 1435 | set_reclaim_mode(priority, sc, true); |
1415 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | 1436 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); |
1416 | } | 1437 | } |
1417 | 1438 | ||
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1426 | zone_idx(zone), | 1447 | zone_idx(zone), |
1427 | nr_scanned, nr_reclaimed, | 1448 | nr_scanned, nr_reclaimed, |
1428 | priority, | 1449 | priority, |
1429 | trace_shrink_flags(file, sc->lumpy_reclaim_mode)); | 1450 | trace_shrink_flags(file, sc->reclaim_mode)); |
1430 | return nr_reclaimed; | 1451 | return nr_reclaimed; |
1431 | } | 1452 | } |
1432 | 1453 | ||
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1466 | 1487 | ||
1467 | list_move(&page->lru, &zone->lru[lru].list); | 1488 | list_move(&page->lru, &zone->lru[lru].list); |
1468 | mem_cgroup_add_lru_list(page, lru); | 1489 | mem_cgroup_add_lru_list(page, lru); |
1469 | pgmoved++; | 1490 | pgmoved += hpage_nr_pages(page); |
1470 | 1491 | ||
1471 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | 1492 | if (!pagevec_add(&pvec, page) || list_empty(list)) { |
1472 | spin_unlock_irq(&zone->lru_lock); | 1493 | spin_unlock_irq(&zone->lru_lock); |
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1534 | } | 1555 | } |
1535 | 1556 | ||
1536 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | 1557 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1537 | nr_rotated++; | 1558 | nr_rotated += hpage_nr_pages(page); |
1538 | /* | 1559 | /* |
1539 | * Identify referenced, file-backed active pages and | 1560 | * Identify referenced, file-backed active pages and |
1540 | * give them one more trip around the active list. So | 1561 | * give them one more trip around the active list. So |
@@ -1805,6 +1826,57 @@ out: | |||
1805 | } | 1826 | } |
1806 | 1827 | ||
1807 | /* | 1828 | /* |
1829 | * Reclaim/compaction depends on a number of pages being freed. To avoid | ||
1830 | * disruption to the system, a small number of order-0 pages continue to be | ||
1831 | * rotated and reclaimed in the normal fashion. However, by the time we get | ||
1832 | * back to the allocator and call try_to_compact_zone(), we ensure that | ||
1833 | * there are enough free pages for it to be likely successful | ||
1834 | */ | ||
1835 | static inline bool should_continue_reclaim(struct zone *zone, | ||
1836 | unsigned long nr_reclaimed, | ||
1837 | unsigned long nr_scanned, | ||
1838 | struct scan_control *sc) | ||
1839 | { | ||
1840 | unsigned long pages_for_compaction; | ||
1841 | unsigned long inactive_lru_pages; | ||
1842 | |||
1843 | /* If not in reclaim/compaction mode, stop */ | ||
1844 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | ||
1845 | return false; | ||
1846 | |||
1847 | /* | ||
1848 | * If we failed to reclaim and have scanned the full list, stop. | ||
1849 | * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far | ||
1850 | * faster but obviously would be less likely to succeed | ||
1851 | * allocation. If this is desirable, use GFP_REPEAT to decide | ||
1852 | * if both reclaimed and scanned should be checked or just | ||
1853 | * reclaimed | ||
1854 | */ | ||
1855 | if (!nr_reclaimed && !nr_scanned) | ||
1856 | return false; | ||
1857 | |||
1858 | /* | ||
1859 | * If we have not reclaimed enough pages for compaction and the | ||
1860 | * inactive lists are large enough, continue reclaiming | ||
1861 | */ | ||
1862 | pages_for_compaction = (2UL << sc->order); | ||
1863 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | ||
1864 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1865 | if (sc->nr_reclaimed < pages_for_compaction && | ||
1866 | inactive_lru_pages > pages_for_compaction) | ||
1867 | return true; | ||
1868 | |||
1869 | /* If compaction would go ahead or the allocation would succeed, stop */ | ||
1870 | switch (compaction_suitable(zone, sc->order)) { | ||
1871 | case COMPACT_PARTIAL: | ||
1872 | case COMPACT_CONTINUE: | ||
1873 | return false; | ||
1874 | default: | ||
1875 | return true; | ||
1876 | } | ||
1877 | } | ||
1878 | |||
1879 | /* | ||
1808 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1880 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
1809 | */ | 1881 | */ |
1810 | static void shrink_zone(int priority, struct zone *zone, | 1882 | static void shrink_zone(int priority, struct zone *zone, |
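As a rough worked example of the threshold in should_continue_reclaim() above: pages_for_compaction = 2UL << sc->order is twice the size of the block being allocated, e.g.

    order = 3 (PAGE_ALLOC_COSTLY_ORDER):  2 << 3 = 16 pages   (64KB with 4KB pages)
    order = 9 (one 2MB THP on x86):       2 << 9 = 1024 pages (4MB)

shrink_zone() keeps restarting while the running total sc->nr_reclaimed is still below that figure and the inactive anon + file lists hold more than that many pages; once enough has been reclaimed (or the inactive lists run low), it stops if compaction_suitable() reports COMPACT_PARTIAL or COMPACT_CONTINUE and continues reclaiming otherwise.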
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1813 | unsigned long nr[NR_LRU_LISTS]; | 1885 | unsigned long nr[NR_LRU_LISTS]; |
1814 | unsigned long nr_to_scan; | 1886 | unsigned long nr_to_scan; |
1815 | enum lru_list l; | 1887 | enum lru_list l; |
1816 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1888 | unsigned long nr_reclaimed; |
1817 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1889 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1890 | unsigned long nr_scanned = sc->nr_scanned; | ||
1818 | 1891 | ||
1892 | restart: | ||
1893 | nr_reclaimed = 0; | ||
1819 | get_scan_count(zone, sc, nr, priority); | 1894 | get_scan_count(zone, sc, nr, priority); |
1820 | 1895 | ||
1821 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1896 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1841 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 1916 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1842 | break; | 1917 | break; |
1843 | } | 1918 | } |
1844 | 1919 | sc->nr_reclaimed += nr_reclaimed; | |
1845 | sc->nr_reclaimed = nr_reclaimed; | ||
1846 | 1920 | ||
1847 | /* | 1921 | /* |
1848 | * Even if we did not try to evict anon pages at all, we want to | 1922 | * Even if we did not try to evict anon pages at all, we want to |
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1851 | if (inactive_anon_is_low(zone, sc)) | 1925 | if (inactive_anon_is_low(zone, sc)) |
1852 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1926 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1853 | 1927 | ||
1928 | /* reclaim/compaction might need reclaim to continue */ | ||
1929 | if (should_continue_reclaim(zone, nr_reclaimed, | ||
1930 | sc->nr_scanned - nr_scanned, sc)) | ||
1931 | goto restart; | ||
1932 | |||
1854 | throttle_vm_writeout(sc->gfp_mask); | 1933 | throttle_vm_writeout(sc->gfp_mask); |
1855 | } | 1934 | } |
1856 | 1935 | ||
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2124 | } | 2203 | } |
2125 | #endif | 2204 | #endif |
2126 | 2205 | ||
2206 | /* | ||
2207 | * pgdat_balanced is used when checking if a node is balanced for high-order | ||
2208 | * allocations. Only zones that meet watermarks and are in a zone allowed | ||
2209 | * by the callers classzone_idx are added to balanced_pages. The total of | ||
2210 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | ||
2211 | * for the node to be considered balanced. Forcing all zones to be balanced | ||
2212 | * for high orders can cause excessive reclaim when there are imbalanced zones. | ||
2213 | * The choice of 25% is due to | ||
2214 | * o a 16M DMA zone that is balanced will not balance a zone on any | ||
2215 | * reasonable sized machine | ||
2216 | * o On all other machines, the top zone must be at least a reasonable | ||
2217 | * percentage of the middle zones. For example, on 32-bit x86, highmem | ||
2218 | * would need to be at least 256M for it to balance a whole node. | ||
2219 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | ||
2220 | * to balance a node on its own. These seemed like reasonable ratios. | ||
2221 | */ | ||
2222 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | ||
2223 | int classzone_idx) | ||
2224 | { | ||
2225 | unsigned long present_pages = 0; | ||
2226 | int i; | ||
2227 | |||
2228 | for (i = 0; i <= classzone_idx; i++) | ||
2229 | present_pages += pgdat->node_zones[i].present_pages; | ||
2230 | |||
2231 | return balanced_pages > (present_pages >> 2); | ||
2232 | } | ||
2233 | |||
2127 | /* is kswapd sleeping prematurely? */ | 2234 | /* is kswapd sleeping prematurely? */ |
2128 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | 2235 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, |
2236 | int classzone_idx) | ||
2129 | { | 2237 | { |
2130 | int i; | 2238 | int i; |
2239 | unsigned long balanced = 0; | ||
2240 | bool all_zones_ok = true; | ||
2131 | 2241 | ||
2132 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2242 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2133 | if (remaining) | 2243 | if (remaining) |
2134 | return 1; | 2244 | return true; |
2135 | 2245 | ||
2136 | /* If after HZ/10, a zone is below the high mark, it's premature */ | 2246 | /* Check the watermark levels */ |
2137 | for (i = 0; i < pgdat->nr_zones; i++) { | 2247 | for (i = 0; i < pgdat->nr_zones; i++) { |
2138 | struct zone *zone = pgdat->node_zones + i; | 2248 | struct zone *zone = pgdat->node_zones + i; |
2139 | 2249 | ||
2140 | if (!populated_zone(zone)) | 2250 | if (!populated_zone(zone)) |
2141 | continue; | 2251 | continue; |
2142 | 2252 | ||
2143 | if (zone->all_unreclaimable) | 2253 | /* |
2254 | * balance_pgdat() skips over all_unreclaimable after | ||
2255 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
2256 | * they must be considered balanced here as well if kswapd | ||
2257 | * is to sleep | ||
2258 | */ | ||
2259 | if (zone->all_unreclaimable) { | ||
2260 | balanced += zone->present_pages; | ||
2144 | continue; | 2261 | continue; |
2262 | } | ||
2145 | 2263 | ||
2146 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2264 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2147 | 0, 0)) | 2265 | classzone_idx, 0)) |
2148 | return 1; | 2266 | all_zones_ok = false; |
2267 | else | ||
2268 | balanced += zone->present_pages; | ||
2149 | } | 2269 | } |
2150 | 2270 | ||
2151 | return 0; | 2271 | /* |
2272 | * For high-order requests, the balanced zones must contain at least | ||
2273 | * 25% of the node's pages for kswapd to sleep. For order-0, all zones | ||
2274 | * must be balanced | ||
2275 | */ | ||
2276 | if (order) | ||
2277 | return pgdat_balanced(pgdat, balanced, classzone_idx); | ||
2278 | else | ||
2279 | return !all_zones_ok; | ||
2152 | } | 2280 | } |
2153 | 2281 | ||
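A rough worked example of the 25% test in pgdat_balanced() above (the zone sizes here are illustrative, not from the patch): on a 32-bit node with a 16M DMA zone, an 880M Normal zone and a 128M HighMem zone, a wakeup with classzone_idx = ZONE_HIGHMEM gives

    present_pages = 16M + 880M + 128M = 1024M
    threshold     = present_pages >> 2 = 256M

so the node counts as balanced once the zones that meet their watermarks (plus, in sleeping_prematurely(), the all_unreclaimable ones) cover more than 256M -- HighMem on its own (128M) is not enough, but the Normal zone alone already is. Per the comment above, that is the condition used for high-order wakeups, while order-0 still requires every zone to meet its high watermark.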
2154 | /* | 2282 | /* |
2155 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2283 | * For kswapd, balance_pgdat() will work across all this node's zones until |
2156 | * they are all at high_wmark_pages(zone). | 2284 | * they are all at high_wmark_pages(zone). |
2157 | * | 2285 | * |
2158 | * Returns the number of pages which were actually freed. | 2286 | * Returns the final order kswapd was reclaiming at |
2159 | * | 2287 | * |
2160 | * There is special handling here for zones which are full of pinned pages. | 2288 | * There is special handling here for zones which are full of pinned pages. |
2161 | * This can happen if the pages are all mlocked, or if they are all used by | 2289 | * This can happen if the pages are all mlocked, or if they are all used by |
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2172 | * interoperates with the page allocator fallback scheme to ensure that aging | 2300 | * interoperates with the page allocator fallback scheme to ensure that aging |
2173 | * of pages is balanced across the zones. | 2301 | * of pages is balanced across the zones. |
2174 | */ | 2302 | */ |
2175 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 2303 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2304 | int *classzone_idx) | ||
2176 | { | 2305 | { |
2177 | int all_zones_ok; | 2306 | int all_zones_ok; |
2307 | unsigned long balanced; | ||
2178 | int priority; | 2308 | int priority; |
2179 | int i; | 2309 | int i; |
2310 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2180 | unsigned long total_scanned; | 2311 | unsigned long total_scanned; |
2181 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2312 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2182 | struct scan_control sc = { | 2313 | struct scan_control sc = { |
@@ -2199,7 +2330,6 @@ loop_again: | |||
2199 | count_vm_event(PAGEOUTRUN); | 2330 | count_vm_event(PAGEOUTRUN); |
2200 | 2331 | ||
2201 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2332 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2202 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2203 | unsigned long lru_pages = 0; | 2333 | unsigned long lru_pages = 0; |
2204 | int has_under_min_watermark_zone = 0; | 2334 | int has_under_min_watermark_zone = 0; |
2205 | 2335 | ||
@@ -2208,6 +2338,7 @@ loop_again: | |||
2208 | disable_swap_token(); | 2338 | disable_swap_token(); |
2209 | 2339 | ||
2210 | all_zones_ok = 1; | 2340 | all_zones_ok = 1; |
2341 | balanced = 0; | ||
2211 | 2342 | ||
2212 | /* | 2343 | /* |
2213 | * Scan in the highmem->dma direction for the highest | 2344 | * Scan in the highmem->dma direction for the highest |
@@ -2230,9 +2361,10 @@ loop_again: | |||
2230 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2361 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2231 | &sc, priority, 0); | 2362 | &sc, priority, 0); |
2232 | 2363 | ||
2233 | if (!zone_watermark_ok(zone, order, | 2364 | if (!zone_watermark_ok_safe(zone, order, |
2234 | high_wmark_pages(zone), 0, 0)) { | 2365 | high_wmark_pages(zone), 0, 0)) { |
2235 | end_zone = i; | 2366 | end_zone = i; |
2367 | *classzone_idx = i; | ||
2236 | break; | 2368 | break; |
2237 | } | 2369 | } |
2238 | } | 2370 | } |
@@ -2255,6 +2387,7 @@ loop_again: | |||
2255 | * cause too much scanning of the lower zones. | 2387 | * cause too much scanning of the lower zones. |
2256 | */ | 2388 | */ |
2257 | for (i = 0; i <= end_zone; i++) { | 2389 | for (i = 0; i <= end_zone; i++) { |
2390 | int compaction; | ||
2258 | struct zone *zone = pgdat->node_zones + i; | 2391 | struct zone *zone = pgdat->node_zones + i; |
2259 | int nr_slab; | 2392 | int nr_slab; |
2260 | 2393 | ||
@@ -2276,7 +2409,7 @@ loop_again: | |||
2276 | * We put equal pressure on every zone, unless one | 2409 | * We put equal pressure on every zone, unless one |
2277 | * zone has way too many pages free already. | 2410 | * zone has way too many pages free already. |
2278 | */ | 2411 | */ |
2279 | if (!zone_watermark_ok(zone, order, | 2412 | if (!zone_watermark_ok_safe(zone, order, |
2280 | 8*high_wmark_pages(zone), end_zone, 0)) | 2413 | 8*high_wmark_pages(zone), end_zone, 0)) |
2281 | shrink_zone(priority, zone, &sc); | 2414 | shrink_zone(priority, zone, &sc); |
2282 | reclaim_state->reclaimed_slab = 0; | 2415 | reclaim_state->reclaimed_slab = 0; |
@@ -2284,9 +2417,26 @@ loop_again: | |||
2284 | lru_pages); | 2417 | lru_pages); |
2285 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2418 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2286 | total_scanned += sc.nr_scanned; | 2419 | total_scanned += sc.nr_scanned; |
2420 | |||
2421 | compaction = 0; | ||
2422 | if (order && | ||
2423 | zone_watermark_ok(zone, 0, | ||
2424 | high_wmark_pages(zone), | ||
2425 | end_zone, 0) && | ||
2426 | !zone_watermark_ok(zone, order, | ||
2427 | high_wmark_pages(zone), | ||
2428 | end_zone, 0)) { | ||
2429 | compact_zone_order(zone, | ||
2430 | order, | ||
2431 | sc.gfp_mask, false, | ||
2432 | COMPACT_MODE_KSWAPD); | ||
2433 | compaction = 1; | ||
2434 | } | ||
2435 | |||
2287 | if (zone->all_unreclaimable) | 2436 | if (zone->all_unreclaimable) |
2288 | continue; | 2437 | continue; |
2289 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2438 | if (!compaction && nr_slab == 0 && |
2439 | !zone_reclaimable(zone)) | ||
2290 | zone->all_unreclaimable = 1; | 2440 | zone->all_unreclaimable = 1; |
2291 | /* | 2441 | /* |
2292 | * If we've done a decent amount of scanning and | 2442 | * If we've done a decent amount of scanning and |
@@ -2297,7 +2447,7 @@ loop_again: | |||
2297 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2447 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2298 | sc.may_writepage = 1; | 2448 | sc.may_writepage = 1; |
2299 | 2449 | ||
2300 | if (!zone_watermark_ok(zone, order, | 2450 | if (!zone_watermark_ok_safe(zone, order, |
2301 | high_wmark_pages(zone), end_zone, 0)) { | 2451 | high_wmark_pages(zone), end_zone, 0)) { |
2302 | all_zones_ok = 0; | 2452 | all_zones_ok = 0; |
2303 | /* | 2453 | /* |
@@ -2305,7 +2455,7 @@ loop_again: | |||
2305 | * means that we have a GFP_ATOMIC allocation | 2455 | * means that we have a GFP_ATOMIC allocation |
2306 | * failure risk. Hurry up! | 2456 | * failure risk. Hurry up! |
2307 | */ | 2457 | */ |
2308 | if (!zone_watermark_ok(zone, order, | 2458 | if (!zone_watermark_ok_safe(zone, order, |
2309 | min_wmark_pages(zone), end_zone, 0)) | 2459 | min_wmark_pages(zone), end_zone, 0)) |
2310 | has_under_min_watermark_zone = 1; | 2460 | has_under_min_watermark_zone = 1; |
2311 | } else { | 2461 | } else { |
@@ -2317,10 +2467,12 @@ loop_again: | |||
2318 | * speculatively avoid congestion waits | 2468 | * speculatively avoid congestion waits |
2318 | */ | 2468 | */ |
2319 | zone_clear_flag(zone, ZONE_CONGESTED); | 2469 | zone_clear_flag(zone, ZONE_CONGESTED); |
2470 | if (i <= *classzone_idx) | ||
2471 | balanced += zone->present_pages; | ||
2320 | } | 2472 | } |
2321 | 2473 | ||
2322 | } | 2474 | } |
2323 | if (all_zones_ok) | 2475 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2324 | break; /* kswapd: all done */ | 2476 | break; /* kswapd: all done */ |
2325 | /* | 2477 | /* |
2326 | * OK, kswapd is getting into trouble. Take a nap, then take | 2478 | * OK, kswapd is getting into trouble. Take a nap, then take |
@@ -2343,7 +2495,13 @@ loop_again: | |||
2343 | break; | 2495 | break; |
2344 | } | 2496 | } |
2345 | out: | 2497 | out: |
2346 | if (!all_zones_ok) { | 2498 | |
2499 | /* | ||
2500 | * order-0: All zones must meet high watermark for a balanced node | ||
2501 | * high-order: Balanced zones must make up at least 25% of the node | ||
2502 | * for the node to be balanced | ||
2503 | */ | ||
2504 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | ||
2347 | cond_resched(); | 2505 | cond_resched(); |
2348 | 2506 | ||
2349 | try_to_freeze(); | 2507 | try_to_freeze(); |
@@ -2368,7 +2526,88 @@ out: | |||
2368 | goto loop_again; | 2526 | goto loop_again; |
2369 | } | 2527 | } |
2370 | 2528 | ||
2371 | return sc.nr_reclaimed; | 2529 | /* |
2530 | * If kswapd was reclaiming at a higher order, it has the option of | ||
2531 | * sleeping without all zones being balanced. Before it does, it must | ||
2532 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
2533 | * that the congestion flags are cleared. The congestion flag must | ||
2534 | * be cleared as kswapd is the only mechanism that clears the flag | ||
2535 | * and it is potentially going to sleep here. | ||
2536 | */ | ||
2537 | if (order) { | ||
2538 | for (i = 0; i <= end_zone; i++) { | ||
2539 | struct zone *zone = pgdat->node_zones + i; | ||
2540 | |||
2541 | if (!populated_zone(zone)) | ||
2542 | continue; | ||
2543 | |||
2544 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | ||
2545 | continue; | ||
2546 | |||
2547 | /* Confirm the zone is balanced for order-0 */ | ||
2548 | if (!zone_watermark_ok(zone, 0, | ||
2549 | high_wmark_pages(zone), 0, 0)) { | ||
2550 | order = sc.order = 0; | ||
2551 | goto loop_again; | ||
2552 | } | ||
2553 | |||
2554 | /* If balanced, clear the congested flag */ | ||
2555 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2556 | } | ||
2557 | } | ||
2558 | |||
2559 | /* | ||
2560 | * Return the order we were reclaiming at so sleeping_prematurely() | ||
2561 | * makes a decision on the order we were last reclaiming at. However, | ||
2562 | * if another caller entered the allocator slow path while kswapd | ||
2563 | * was awake, order will remain at the higher level | ||
2564 | */ | ||
2565 | *classzone_idx = end_zone; | ||
2566 | return order; | ||
2567 | } | ||
2568 | |||
2569 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | ||
2570 | { | ||
2571 | long remaining = 0; | ||
2572 | DEFINE_WAIT(wait); | ||
2573 | |||
2574 | if (freezing(current) || kthread_should_stop()) | ||
2575 | return; | ||
2576 | |||
2577 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2578 | |||
2579 | /* Try to sleep for a short interval */ | ||
2580 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2581 | remaining = schedule_timeout(HZ/10); | ||
2582 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2583 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2584 | } | ||
2585 | |||
2586 | /* | ||
2587 | * After a short sleep, check if it was a premature sleep. If not, then | ||
2588 | * go fully to sleep until explicitly woken up. | ||
2589 | */ | ||
2590 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2591 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2592 | |||
2593 | /* | ||
2594 | * vmstat counters are not perfectly accurate and the estimated | ||
2595 | * value for counters such as NR_FREE_PAGES can deviate from the | ||
2596 | * true value by nr_online_cpus * threshold. To avoid the zone | ||
2597 | * watermarks being breached while under pressure, we reduce the | ||
2598 | * per-cpu vmstat threshold while kswapd is awake and restore | ||
2599 | * them before going back to sleep. | ||
2600 | */ | ||
2601 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | ||
2602 | schedule(); | ||
2603 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | ||
2604 | } else { | ||
2605 | if (remaining) | ||
2606 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2607 | else | ||
2608 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2609 | } | ||
2610 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2372 | } | 2611 | } |
2373 | 2612 | ||
2374 | /* | 2613 | /* |
@@ -2387,9 +2626,10 @@ out: | |||
2387 | static int kswapd(void *p) | 2626 | static int kswapd(void *p) |
2388 | { | 2627 | { |
2389 | unsigned long order; | 2628 | unsigned long order; |
2629 | int classzone_idx; | ||
2390 | pg_data_t *pgdat = (pg_data_t*)p; | 2630 | pg_data_t *pgdat = (pg_data_t*)p; |
2391 | struct task_struct *tsk = current; | 2631 | struct task_struct *tsk = current; |
2392 | DEFINE_WAIT(wait); | 2632 | |
2393 | struct reclaim_state reclaim_state = { | 2633 | struct reclaim_state reclaim_state = { |
2394 | .reclaimed_slab = 0, | 2634 | .reclaimed_slab = 0, |
2395 | }; | 2635 | }; |
@@ -2417,49 +2657,30 @@ static int kswapd(void *p) | |||
2417 | set_freezable(); | 2657 | set_freezable(); |
2418 | 2658 | ||
2419 | order = 0; | 2659 | order = 0; |
2660 | classzone_idx = MAX_NR_ZONES - 1; | ||
2420 | for ( ; ; ) { | 2661 | for ( ; ; ) { |
2421 | unsigned long new_order; | 2662 | unsigned long new_order; |
2663 | int new_classzone_idx; | ||
2422 | int ret; | 2664 | int ret; |
2423 | 2665 | ||
2424 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2425 | new_order = pgdat->kswapd_max_order; | 2666 | new_order = pgdat->kswapd_max_order; |
2667 | new_classzone_idx = pgdat->classzone_idx; | ||
2426 | pgdat->kswapd_max_order = 0; | 2668 | pgdat->kswapd_max_order = 0; |
2427 | if (order < new_order) { | 2669 | pgdat->classzone_idx = MAX_NR_ZONES - 1; |
2670 | if (order < new_order || classzone_idx > new_classzone_idx) { | ||
2428 | /* | 2671 | /* |
2429 | * Don't sleep if someone wants a larger 'order' | 2672 | * Don't sleep if someone wants a larger 'order' |
2430 | * allocation | 2673 | * allocation or has tighter zone constraints |
2431 | */ | 2674 | */ |
2432 | order = new_order; | 2675 | order = new_order; |
2676 | classzone_idx = new_classzone_idx; | ||
2433 | } else { | 2677 | } else { |
2434 | if (!freezing(current) && !kthread_should_stop()) { | 2678 | kswapd_try_to_sleep(pgdat, order, classzone_idx); |
2435 | long remaining = 0; | ||
2436 | |||
2437 | /* Try to sleep for a short interval */ | ||
2438 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2439 | remaining = schedule_timeout(HZ/10); | ||
2440 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2441 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2442 | } | ||
2443 | |||
2444 | /* | ||
2445 | * After a short sleep, check if it was a | ||
2446 | * premature sleep. If not, then go fully | ||
2447 | * to sleep until explicitly woken up | ||
2448 | */ | ||
2449 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2450 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2451 | schedule(); | ||
2452 | } else { | ||
2453 | if (remaining) | ||
2454 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2455 | else | ||
2456 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2457 | } | ||
2458 | } | ||
2459 | |||
2460 | order = pgdat->kswapd_max_order; | 2679 | order = pgdat->kswapd_max_order; |
2680 | classzone_idx = pgdat->classzone_idx; | ||
2681 | pgdat->kswapd_max_order = 0; | ||
2682 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | ||
2461 | } | 2683 | } |
2462 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2463 | 2684 | ||
2464 | ret = try_to_freeze(); | 2685 | ret = try_to_freeze(); |
2465 | if (kthread_should_stop()) | 2686 | if (kthread_should_stop()) |
@@ -2471,7 +2692,7 @@ static int kswapd(void *p) | |||
2471 | */ | 2692 | */ |
2472 | if (!ret) { | 2693 | if (!ret) { |
2473 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2694 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2474 | balance_pgdat(pgdat, order); | 2695 | order = balance_pgdat(pgdat, order, &classzone_idx); |
2475 | } | 2696 | } |
2476 | } | 2697 | } |
2477 | return 0; | 2698 | return 0; |
@@ -2480,23 +2701,26 @@ static int kswapd(void *p) | |||
2480 | /* | 2701 | /* |
2481 | * A zone is low on free memory, so wake its kswapd task to service it. | 2702 | * A zone is low on free memory, so wake its kswapd task to service it. |
2482 | */ | 2703 | */ |
2483 | void wakeup_kswapd(struct zone *zone, int order) | 2704 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
2484 | { | 2705 | { |
2485 | pg_data_t *pgdat; | 2706 | pg_data_t *pgdat; |
2486 | 2707 | ||
2487 | if (!populated_zone(zone)) | 2708 | if (!populated_zone(zone)) |
2488 | return; | 2709 | return; |
2489 | 2710 | ||
2490 | pgdat = zone->zone_pgdat; | ||
2491 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2492 | return; | ||
2493 | if (pgdat->kswapd_max_order < order) | ||
2494 | pgdat->kswapd_max_order = order; | ||
2495 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2496 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2711 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2497 | return; | 2712 | return; |
2713 | pgdat = zone->zone_pgdat; | ||
2714 | if (pgdat->kswapd_max_order < order) { | ||
2715 | pgdat->kswapd_max_order = order; | ||
2716 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | ||
2717 | } | ||
2498 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2718 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2499 | return; | 2719 | return; |
2720 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2721 | return; | ||
2722 | |||
2723 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2500 | wake_up_interruptible(&pgdat->kswapd_wait); | 2724 | wake_up_interruptible(&pgdat->kswapd_wait); |
2501 | } | 2725 | } |
2502 | 2726 | ||
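The new wakeup_kswapd() signature needs a matching change on the caller side, which falls outside this mm/vmscan.c-only diffstat. A hedged sketch of what the page-allocator side would look like -- the wake_all_kswapd() name and the use of zone_idx(preferred_zone) are assumptions here, not something this diff shows:

/* Assumed caller in mm/page_alloc.c -- not part of this diff. */
static void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
			    enum zone_type high_zoneidx,
			    enum zone_type classzone_idx)
{
	struct zoneref *z;
	struct zone *zone;

	/* classzone_idx would typically be zone_idx(preferred_zone) for the
	 * allocation that is struggling, so kswapd balances the node with
	 * respect to that zone rather than to every zone it has. */
	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
		wakeup_kswapd(zone, order, classzone_idx);
}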