author:    Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer: Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit:    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e
tree:      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/vmscan.c
parent:    ea53c912f8a86a8567697115b6a0d8152beee5c8
parent:    6a00f206debf8a5c8899055726ad127dbeeed098
Merge branch 'mpi-master' into wip-k-fmlp
Conflicts:
litmus/sched_cedf.c
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  |  853
1 file changed, 625 insertions, 228 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..d036e59d302b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/topology.h> | 32 | #include <linux/topology.h> |
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
35 | #include <linux/compaction.h> | ||
35 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
36 | #include <linux/rwsem.h> | 37 | #include <linux/rwsem.h> |
37 | #include <linux/delay.h> | 38 | #include <linux/delay.h> |
@@ -40,6 +41,8 @@ | |||
40 | #include <linux/memcontrol.h> | 41 | #include <linux/memcontrol.h> |
41 | #include <linux/delayacct.h> | 42 | #include <linux/delayacct.h> |
42 | #include <linux/sysctl.h> | 43 | #include <linux/sysctl.h> |
44 | #include <linux/oom.h> | ||
45 | #include <linux/prefetch.h> | ||
43 | 46 | ||
44 | #include <asm/tlbflush.h> | 47 | #include <asm/tlbflush.h> |
45 | #include <asm/div64.h> | 48 | #include <asm/div64.h> |
@@ -51,6 +54,24 @@ | |||
51 | #define CREATE_TRACE_POINTS | 54 | #define CREATE_TRACE_POINTS |
52 | #include <trace/events/vmscan.h> | 55 | #include <trace/events/vmscan.h> |
53 | 56 | ||
57 | /* | ||
58 | * reclaim_mode determines how the inactive list is shrunk | ||
59 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages | ||
60 | * RECLAIM_MODE_ASYNC: Do not block | ||
61 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback | ||
62 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
63 | * page from the LRU and reclaim all pages within a | ||
64 | * naturally aligned range | ||
65 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
66 | * order-0 pages and then compact the zone | ||
67 | */ | ||
68 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
69 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
70 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
71 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
72 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
73 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
74 | |||
54 | struct scan_control { | 75 | struct scan_control { |
55 | /* Incremented by the number of inactive pages that were scanned */ | 76 | /* Incremented by the number of inactive pages that were scanned */ |
56 | unsigned long nr_scanned; | 77 | unsigned long nr_scanned; |
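The reclaim_mode_t flags introduced in the hunk above replace the single lumpy_reclaim_mode boolean (see the scan_control change in the next hunk), so one "how much to reclaim" flag and one "may it block" flag can be combined and tested independently. A minimal userspace sketch of that usage follows; the sparse-only __bitwise__/__force annotations are dropped here so the example compiles as plain C.

```c
/* Minimal sketch of the reclaim_mode bitmask added above.
 * The __bitwise__/__force annotations are sparse-only and are omitted. */
#include <stdio.h>

typedef unsigned reclaim_mode_t;

#define RECLAIM_MODE_SINGLE        0x01u
#define RECLAIM_MODE_ASYNC         0x02u
#define RECLAIM_MODE_SYNC          0x04u
#define RECLAIM_MODE_LUMPYRECLAIM  0x08u
#define RECLAIM_MODE_COMPACTION    0x10u

int main(void)
{
	/* Plain order-0 reclaim: the combination reset_reclaim_mode() restores. */
	reclaim_mode_t mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
	printf("single+async: may block? %d\n", !!(mode & RECLAIM_MODE_SYNC));

	/* High-order reclaim under pressure: lumpy reclaim that may wait on writeback. */
	mode = RECLAIM_MODE_LUMPYRECLAIM | RECLAIM_MODE_SYNC;
	printf("lumpy+sync:   may block? %d\n", !!(mode & RECLAIM_MODE_SYNC));
	return 0;
}
```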
@@ -79,10 +100,10 @@ struct scan_control { | |||
79 | int order; | 100 | int order; |
80 | 101 | ||
81 | /* | 102 | /* |
82 | * Intend to reclaim enough contenious memory rather than to reclaim | 103 | * Intend to reclaim enough continuous memory rather than reclaim |
83 | * enough amount memory. I.e, it's the mode for high order allocation. | 104 | * enough amount of memory. i.e, mode for high order allocation. |
84 | */ | 105 | */ |
85 | bool lumpy_reclaim_mode; | 106 | reclaim_mode_t reclaim_mode; |
86 | 107 | ||
87 | /* Which cgroup do we reclaim from */ | 108 | /* Which cgroup do we reclaim from */ |
88 | struct mem_cgroup *mem_cgroup; | 109 | struct mem_cgroup *mem_cgroup; |
@@ -152,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, | |||
152 | struct scan_control *sc, enum lru_list lru) | 173 | struct scan_control *sc, enum lru_list lru) |
153 | { | 174 | { |
154 | if (!scanning_global_lru(sc)) | 175 | if (!scanning_global_lru(sc)) |
155 | return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); | 176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); |
156 | 177 | ||
157 | return zone_page_state(zone, NR_LRU_BASE + lru); | 178 | return zone_page_state(zone, NR_LRU_BASE + lru); |
158 | } | 179 | } |
@@ -181,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker) | |||
181 | } | 202 | } |
182 | EXPORT_SYMBOL(unregister_shrinker); | 203 | EXPORT_SYMBOL(unregister_shrinker); |
183 | 204 | ||
205 | static inline int do_shrinker_shrink(struct shrinker *shrinker, | ||
206 | struct shrink_control *sc, | ||
207 | unsigned long nr_to_scan) | ||
208 | { | ||
209 | sc->nr_to_scan = nr_to_scan; | ||
210 | return (*shrinker->shrink)(shrinker, sc); | ||
211 | } | ||
212 | |||
184 | #define SHRINK_BATCH 128 | 213 | #define SHRINK_BATCH 128 |
185 | /* | 214 | /* |
186 | * Call the shrink functions to age shrinkable caches | 215 | * Call the shrink functions to age shrinkable caches |
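The do_shrinker_shrink() helper added above packs nr_to_scan into the shrink_control argument before invoking the callback, replacing the older three-argument ->shrink(shrinker, nr_to_scan, gfp_mask) calls that the rest of this patch converts. The sketch below models that convention in userspace; the struct layouts are simplified stand-ins carrying only the fields visible in this diff, not the real kernel definitions.

```c
/* Userspace model of the shrink_control-based shrinker callback convention. */
#include <stdio.h>

struct shrink_control {
	unsigned gfp_mask;
	unsigned long nr_to_scan;
};

struct shrinker {
	int (*shrink)(struct shrinker *, struct shrink_control *);
	int seeks;
	unsigned long nr;	/* objects pending scan, as in the real struct */
};

static int do_shrinker_shrink(struct shrinker *shrinker,
			      struct shrink_control *sc,
			      unsigned long nr_to_scan)
{
	sc->nr_to_scan = nr_to_scan;
	return shrinker->shrink(shrinker, sc);
}

/* Toy cache: nr_to_scan == 0 is a size query, otherwise drop objects. */
static unsigned long cached = 1000;

static int toy_shrink(struct shrinker *s, struct shrink_control *sc)
{
	if (sc->nr_to_scan) {
		unsigned long drop = sc->nr_to_scan < cached ? sc->nr_to_scan : cached;
		cached -= drop;
	}
	return (int)cached;	/* remaining object count */
}

int main(void)
{
	struct shrinker s = { .shrink = toy_shrink, .seeks = 2 };
	struct shrink_control sc = { .gfp_mask = 0 };

	int before = do_shrinker_shrink(&s, &sc, 0);	/* query, like the kernel does */
	int after  = do_shrinker_shrink(&s, &sc, 128);	/* scan one batch */
	printf("before=%d after=%d\n", before, after);
	return 0;
}
```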
@@ -201,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
201 | * | 230 | * |
202 | * Returns the number of slab objects which we shrunk. | 231 | * Returns the number of slab objects which we shrunk. |
203 | */ | 232 | */ |
204 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | 233 | unsigned long shrink_slab(struct shrink_control *shrink, |
205 | unsigned long lru_pages) | 234 | unsigned long nr_pages_scanned, |
235 | unsigned long lru_pages) | ||
206 | { | 236 | { |
207 | struct shrinker *shrinker; | 237 | struct shrinker *shrinker; |
208 | unsigned long ret = 0; | 238 | unsigned long ret = 0; |
209 | 239 | ||
210 | if (scanned == 0) | 240 | if (nr_pages_scanned == 0) |
211 | scanned = SWAP_CLUSTER_MAX; | 241 | nr_pages_scanned = SWAP_CLUSTER_MAX; |
212 | 242 | ||
213 | if (!down_read_trylock(&shrinker_rwsem)) | 243 | if (!down_read_trylock(&shrinker_rwsem)) { |
214 | return 1; /* Assume we'll be able to shrink next time */ | 244 | /* Assume we'll be able to shrink next time */ |
245 | ret = 1; | ||
246 | goto out; | ||
247 | } | ||
215 | 248 | ||
216 | list_for_each_entry(shrinker, &shrinker_list, list) { | 249 | list_for_each_entry(shrinker, &shrinker_list, list) { |
217 | unsigned long long delta; | 250 | unsigned long long delta; |
218 | unsigned long total_scan; | 251 | unsigned long total_scan; |
219 | unsigned long max_pass; | 252 | unsigned long max_pass; |
220 | 253 | ||
221 | max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); | 254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
222 | delta = (4 * scanned) / shrinker->seeks; | 255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
223 | delta *= max_pass; | 256 | delta *= max_pass; |
224 | do_div(delta, lru_pages + 1); | 257 | do_div(delta, lru_pages + 1); |
225 | shrinker->nr += delta; | 258 | shrinker->nr += delta; |
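The pressure calculation above asks each shrinker to scan a number of objects proportional to the LRU scanning that just happened: roughly max_pass * (4 * nr_pages_scanned / seeks) / (lru_pages + 1). A worked computation with made-up numbers:

```c
/* Worked example of the shrink_slab() pressure formula with made-up inputs. */
#include <stdio.h>

int main(void)
{
	unsigned long long nr_pages_scanned = 1000;	/* LRU pages just scanned */
	unsigned long long lru_pages = 100000;		/* reclaimable LRU pages */
	unsigned long long max_pass = 5000;		/* objects in this cache */
	int seeks = 2;					/* cost to recreate an object */

	unsigned long long delta = (4 * nr_pages_scanned) / seeks;	/* 2000 */
	delta *= max_pass;						/* 10,000,000 */
	delta /= lru_pages + 1;						/* ~99 objects */

	printf("objects to scan: %llu\n", delta);
	return 0;
}
```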
@@ -246,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
246 | int shrink_ret; | 279 | int shrink_ret; |
247 | int nr_before; | 280 | int nr_before; |
248 | 281 | ||
249 | nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); | 282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
250 | shrink_ret = (*shrinker->shrink)(shrinker, this_scan, | 283 | shrink_ret = do_shrinker_shrink(shrinker, shrink, |
251 | gfp_mask); | 284 | this_scan); |
252 | if (shrink_ret == -1) | 285 | if (shrink_ret == -1) |
253 | break; | 286 | break; |
254 | if (shrink_ret < nr_before) | 287 | if (shrink_ret < nr_before) |
@@ -262,9 +295,44 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
262 | shrinker->nr += total_scan; | 295 | shrinker->nr += total_scan; |
263 | } | 296 | } |
264 | up_read(&shrinker_rwsem); | 297 | up_read(&shrinker_rwsem); |
298 | out: | ||
299 | cond_resched(); | ||
265 | return ret; | 300 | return ret; |
266 | } | 301 | } |
267 | 302 | ||
303 | static void set_reclaim_mode(int priority, struct scan_control *sc, | ||
304 | bool sync) | ||
305 | { | ||
306 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; | ||
307 | |||
308 | /* | ||
309 | * Initially assume we are entering either lumpy reclaim or | ||
310 | * reclaim/compaction.Depending on the order, we will either set the | ||
311 | * sync mode or just reclaim order-0 pages later. | ||
312 | */ | ||
313 | if (COMPACTION_BUILD) | ||
314 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; | ||
315 | else | ||
316 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
317 | |||
318 | /* | ||
319 | * Avoid using lumpy reclaim or reclaim/compaction if possible by | ||
320 | * restricting when its set to either costly allocations or when | ||
321 | * under memory pressure | ||
322 | */ | ||
323 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
324 | sc->reclaim_mode |= syncmode; | ||
325 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
326 | sc->reclaim_mode |= syncmode; | ||
327 | else | ||
328 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
329 | } | ||
330 | |||
331 | static void reset_reclaim_mode(struct scan_control *sc) | ||
332 | { | ||
333 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
334 | } | ||
335 | |||
268 | static inline int is_page_cache_freeable(struct page *page) | 336 | static inline int is_page_cache_freeable(struct page *page) |
269 | { | 337 | { |
270 | /* | 338 | /* |
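set_reclaim_mode(), added above, only escalates to lumpy reclaim or reclaim/compaction (and possibly synchronous waiting) for costly orders or once the reclaim priority has dropped; everything else stays order-0 and non-blocking. A standalone sketch of that decision, assuming the usual constants PAGE_ALLOC_COSTLY_ORDER == 3 and DEF_PRIORITY == 12:

```c
/* Sketch of the set_reclaim_mode() policy; constants assumed as noted above. */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_ALLOC_COSTLY_ORDER	3
#define DEF_PRIORITY		12

static const char *mode_for(int order, int priority, bool sync)
{
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return sync ? "lumpy/compaction + SYNC" : "lumpy/compaction + ASYNC";
	if (order && priority < DEF_PRIORITY - 2)
		return sync ? "lumpy/compaction + SYNC" : "lumpy/compaction + ASYNC";
	return "SINGLE + ASYNC";
}

int main(void)
{
	/* order-0 reclaim never uses lumpy/compaction mode */
	printf("order=0 prio=12: %s\n", mode_for(0, 12, false));
	/* a small high-order request escalates only after priority drops */
	printf("order=2 prio=12: %s\n", mode_for(2, 12, false));
	printf("order=2 prio=9:  %s\n", mode_for(2, 9, false));
	/* a hugepage-sized request escalates immediately */
	printf("order=9 prio=12: %s\n", mode_for(9, 12, true));
	return 0;
}
```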
@@ -275,7 +343,8 @@ static inline int is_page_cache_freeable(struct page *page) | |||
275 | return page_count(page) - page_has_private(page) == 2; | 343 | return page_count(page) - page_has_private(page) == 2; |
276 | } | 344 | } |
277 | 345 | ||
278 | static int may_write_to_queue(struct backing_dev_info *bdi) | 346 | static int may_write_to_queue(struct backing_dev_info *bdi, |
347 | struct scan_control *sc) | ||
279 | { | 348 | { |
280 | if (current->flags & PF_SWAPWRITE) | 349 | if (current->flags & PF_SWAPWRITE) |
281 | return 1; | 350 | return 1; |
@@ -283,6 +352,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) | |||
283 | return 1; | 352 | return 1; |
284 | if (bdi == current->backing_dev_info) | 353 | if (bdi == current->backing_dev_info) |
285 | return 1; | 354 | return 1; |
355 | |||
356 | /* lumpy reclaim for hugepage often need a lot of write */ | ||
357 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
358 | return 1; | ||
286 | return 0; | 359 | return 0; |
287 | } | 360 | } |
288 | 361 | ||
@@ -301,18 +374,12 @@ static int may_write_to_queue(struct backing_dev_info *bdi) | |||
301 | static void handle_write_error(struct address_space *mapping, | 374 | static void handle_write_error(struct address_space *mapping, |
302 | struct page *page, int error) | 375 | struct page *page, int error) |
303 | { | 376 | { |
304 | lock_page_nosync(page); | 377 | lock_page(page); |
305 | if (page_mapping(page) == mapping) | 378 | if (page_mapping(page) == mapping) |
306 | mapping_set_error(mapping, error); | 379 | mapping_set_error(mapping, error); |
307 | unlock_page(page); | 380 | unlock_page(page); |
308 | } | 381 | } |
309 | 382 | ||
310 | /* Request for sync pageout. */ | ||
311 | enum pageout_io { | ||
312 | PAGEOUT_IO_ASYNC, | ||
313 | PAGEOUT_IO_SYNC, | ||
314 | }; | ||
315 | |||
316 | /* possible outcome of pageout() */ | 383 | /* possible outcome of pageout() */ |
317 | typedef enum { | 384 | typedef enum { |
318 | /* failed to write page out, page is locked */ | 385 | /* failed to write page out, page is locked */ |
@@ -330,7 +397,7 @@ typedef enum { | |||
330 | * Calls ->writepage(). | 397 | * Calls ->writepage(). |
331 | */ | 398 | */ |
332 | static pageout_t pageout(struct page *page, struct address_space *mapping, | 399 | static pageout_t pageout(struct page *page, struct address_space *mapping, |
333 | enum pageout_io sync_writeback) | 400 | struct scan_control *sc) |
334 | { | 401 | { |
335 | /* | 402 | /* |
336 | * If the page is dirty, only perform writeback if that write | 403 | * If the page is dirty, only perform writeback if that write |
@@ -366,7 +433,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
366 | } | 433 | } |
367 | if (mapping->a_ops->writepage == NULL) | 434 | if (mapping->a_ops->writepage == NULL) |
368 | return PAGE_ACTIVATE; | 435 | return PAGE_ACTIVATE; |
369 | if (!may_write_to_queue(mapping->backing_dev_info)) | 436 | if (!may_write_to_queue(mapping->backing_dev_info, sc)) |
370 | return PAGE_KEEP; | 437 | return PAGE_KEEP; |
371 | 438 | ||
372 | if (clear_page_dirty_for_io(page)) { | 439 | if (clear_page_dirty_for_io(page)) { |
@@ -376,7 +443,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
376 | .nr_to_write = SWAP_CLUSTER_MAX, | 443 | .nr_to_write = SWAP_CLUSTER_MAX, |
377 | .range_start = 0, | 444 | .range_start = 0, |
378 | .range_end = LLONG_MAX, | 445 | .range_end = LLONG_MAX, |
379 | .nonblocking = 1, | ||
380 | .for_reclaim = 1, | 446 | .for_reclaim = 1, |
381 | }; | 447 | }; |
382 | 448 | ||
@@ -394,7 +460,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
394 | * direct reclaiming a large contiguous area and the | 460 | * direct reclaiming a large contiguous area and the |
395 | * first attempt to free a range of pages fails. | 461 | * first attempt to free a range of pages fails. |
396 | */ | 462 | */ |
397 | if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) | 463 | if (PageWriteback(page) && |
464 | (sc->reclaim_mode & RECLAIM_MODE_SYNC)) | ||
398 | wait_on_page_writeback(page); | 465 | wait_on_page_writeback(page); |
399 | 466 | ||
400 | if (!PageWriteback(page)) { | 467 | if (!PageWriteback(page)) { |
@@ -402,7 +469,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
402 | ClearPageReclaim(page); | 469 | ClearPageReclaim(page); |
403 | } | 470 | } |
404 | trace_mm_vmscan_writepage(page, | 471 | trace_mm_vmscan_writepage(page, |
405 | trace_reclaim_flags(page, sync_writeback)); | 472 | trace_reclaim_flags(page, sc->reclaim_mode)); |
406 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 473 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
407 | return PAGE_SUCCESS; | 474 | return PAGE_SUCCESS; |
408 | } | 475 | } |
@@ -459,9 +526,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
459 | spin_unlock_irq(&mapping->tree_lock); | 526 | spin_unlock_irq(&mapping->tree_lock); |
460 | swapcache_free(swap, page); | 527 | swapcache_free(swap, page); |
461 | } else { | 528 | } else { |
462 | __remove_from_page_cache(page); | 529 | void (*freepage)(struct page *); |
530 | |||
531 | freepage = mapping->a_ops->freepage; | ||
532 | |||
533 | __delete_from_page_cache(page); | ||
463 | spin_unlock_irq(&mapping->tree_lock); | 534 | spin_unlock_irq(&mapping->tree_lock); |
464 | mem_cgroup_uncharge_cache_page(page); | 535 | mem_cgroup_uncharge_cache_page(page); |
536 | |||
537 | if (freepage != NULL) | ||
538 | freepage(page); | ||
465 | } | 539 | } |
466 | 540 | ||
467 | return 1; | 541 | return 1; |
@@ -580,7 +654,7 @@ static enum page_references page_check_references(struct page *page, | |||
580 | referenced_page = TestClearPageReferenced(page); | 654 | referenced_page = TestClearPageReferenced(page); |
581 | 655 | ||
582 | /* Lumpy reclaim - ignore references */ | 656 | /* Lumpy reclaim - ignore references */ |
583 | if (sc->lumpy_reclaim_mode) | 657 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
584 | return PAGEREF_RECLAIM; | 658 | return PAGEREF_RECLAIM; |
585 | 659 | ||
586 | /* | 660 | /* |
@@ -616,7 +690,7 @@ static enum page_references page_check_references(struct page *page, | |||
616 | } | 690 | } |
617 | 691 | ||
618 | /* Reclaim if clean, defer dirty pages to writeback */ | 692 | /* Reclaim if clean, defer dirty pages to writeback */ |
619 | if (referenced_page) | 693 | if (referenced_page && !PageSwapBacked(page)) |
620 | return PAGEREF_RECLAIM_CLEAN; | 694 | return PAGEREF_RECLAIM_CLEAN; |
621 | 695 | ||
622 | return PAGEREF_RECLAIM; | 696 | return PAGEREF_RECLAIM; |
@@ -644,12 +718,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) | |||
644 | * shrink_page_list() returns the number of reclaimed pages | 718 | * shrink_page_list() returns the number of reclaimed pages |
645 | */ | 719 | */ |
646 | static unsigned long shrink_page_list(struct list_head *page_list, | 720 | static unsigned long shrink_page_list(struct list_head *page_list, |
647 | struct scan_control *sc, | 721 | struct zone *zone, |
648 | enum pageout_io sync_writeback) | 722 | struct scan_control *sc) |
649 | { | 723 | { |
650 | LIST_HEAD(ret_pages); | 724 | LIST_HEAD(ret_pages); |
651 | LIST_HEAD(free_pages); | 725 | LIST_HEAD(free_pages); |
652 | int pgactivate = 0; | 726 | int pgactivate = 0; |
727 | unsigned long nr_dirty = 0; | ||
728 | unsigned long nr_congested = 0; | ||
653 | unsigned long nr_reclaimed = 0; | 729 | unsigned long nr_reclaimed = 0; |
654 | 730 | ||
655 | cond_resched(); | 731 | cond_resched(); |
@@ -669,6 +745,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
669 | goto keep; | 745 | goto keep; |
670 | 746 | ||
671 | VM_BUG_ON(PageActive(page)); | 747 | VM_BUG_ON(PageActive(page)); |
748 | VM_BUG_ON(page_zone(page) != zone); | ||
672 | 749 | ||
673 | sc->nr_scanned++; | 750 | sc->nr_scanned++; |
674 | 751 | ||
@@ -694,10 +771,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
694 | * for any page for which writeback has already | 771 | * for any page for which writeback has already |
695 | * started. | 772 | * started. |
696 | */ | 773 | */ |
697 | if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) | 774 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && |
775 | may_enter_fs) | ||
698 | wait_on_page_writeback(page); | 776 | wait_on_page_writeback(page); |
699 | else | 777 | else { |
700 | goto keep_locked; | 778 | unlock_page(page); |
779 | goto keep_lumpy; | ||
780 | } | ||
701 | } | 781 | } |
702 | 782 | ||
703 | references = page_check_references(page, sc); | 783 | references = page_check_references(page, sc); |
@@ -743,6 +823,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
743 | } | 823 | } |
744 | 824 | ||
745 | if (PageDirty(page)) { | 825 | if (PageDirty(page)) { |
826 | nr_dirty++; | ||
827 | |||
746 | if (references == PAGEREF_RECLAIM_CLEAN) | 828 | if (references == PAGEREF_RECLAIM_CLEAN) |
747 | goto keep_locked; | 829 | goto keep_locked; |
748 | if (!may_enter_fs) | 830 | if (!may_enter_fs) |
@@ -751,14 +833,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
751 | goto keep_locked; | 833 | goto keep_locked; |
752 | 834 | ||
753 | /* Page is dirty, try to write it out here */ | 835 | /* Page is dirty, try to write it out here */ |
754 | switch (pageout(page, mapping, sync_writeback)) { | 836 | switch (pageout(page, mapping, sc)) { |
755 | case PAGE_KEEP: | 837 | case PAGE_KEEP: |
838 | nr_congested++; | ||
756 | goto keep_locked; | 839 | goto keep_locked; |
757 | case PAGE_ACTIVATE: | 840 | case PAGE_ACTIVATE: |
758 | goto activate_locked; | 841 | goto activate_locked; |
759 | case PAGE_SUCCESS: | 842 | case PAGE_SUCCESS: |
760 | if (PageWriteback(page) || PageDirty(page)) | 843 | if (PageWriteback(page)) |
844 | goto keep_lumpy; | ||
845 | if (PageDirty(page)) | ||
761 | goto keep; | 846 | goto keep; |
847 | |||
762 | /* | 848 | /* |
763 | * A synchronous write - probably a ramdisk. Go | 849 | * A synchronous write - probably a ramdisk. Go |
764 | * ahead and try to reclaim the page. | 850 | * ahead and try to reclaim the page. |
@@ -841,6 +927,7 @@ cull_mlocked: | |||
841 | try_to_free_swap(page); | 927 | try_to_free_swap(page); |
842 | unlock_page(page); | 928 | unlock_page(page); |
843 | putback_lru_page(page); | 929 | putback_lru_page(page); |
930 | reset_reclaim_mode(sc); | ||
844 | continue; | 931 | continue; |
845 | 932 | ||
846 | activate_locked: | 933 | activate_locked: |
@@ -853,10 +940,21 @@ activate_locked: | |||
853 | keep_locked: | 940 | keep_locked: |
854 | unlock_page(page); | 941 | unlock_page(page); |
855 | keep: | 942 | keep: |
943 | reset_reclaim_mode(sc); | ||
944 | keep_lumpy: | ||
856 | list_add(&page->lru, &ret_pages); | 945 | list_add(&page->lru, &ret_pages); |
857 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 946 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
858 | } | 947 | } |
859 | 948 | ||
949 | /* | ||
950 | * Tag a zone as congested if all the dirty pages encountered were | ||
951 | * backed by a congested BDI. In this case, reclaimers should just | ||
952 | * back off and wait for congestion to clear because further reclaim | ||
953 | * will encounter the same problem | ||
954 | */ | ||
955 | if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) | ||
956 | zone_set_flag(zone, ZONE_CONGESTED); | ||
957 | |||
860 | free_page_list(&free_pages); | 958 | free_page_list(&free_pages); |
861 | 959 | ||
862 | list_splice(&ret_pages, page_list); | 960 | list_splice(&ret_pages, page_list); |
@@ -962,7 +1060,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
962 | case 0: | 1060 | case 0: |
963 | list_move(&page->lru, dst); | 1061 | list_move(&page->lru, dst); |
964 | mem_cgroup_del_lru(page); | 1062 | mem_cgroup_del_lru(page); |
965 | nr_taken++; | 1063 | nr_taken += hpage_nr_pages(page); |
966 | break; | 1064 | break; |
967 | 1065 | ||
968 | case -EBUSY: | 1066 | case -EBUSY: |
@@ -983,7 +1081,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
983 | * surrounding the tag page. Only take those pages of | 1081 | * surrounding the tag page. Only take those pages of |
984 | * the same active state as that tag page. We may safely | 1082 | * the same active state as that tag page. We may safely |
985 | * round the target page pfn down to the requested order | 1083 | * round the target page pfn down to the requested order |
986 | * as the mem_map is guarenteed valid out to MAX_ORDER, | 1084 | * as the mem_map is guaranteed valid out to MAX_ORDER, |
987 | * where that page is in a different zone we will detect | 1085 | * where that page is in a different zone we will detect |
988 | * it from its zone id and abort this block scan. | 1086 | * it from its zone id and abort this block scan. |
989 | */ | 1087 | */ |
@@ -1006,7 +1104,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1006 | 1104 | ||
1007 | /* Check that we have not crossed a zone boundary. */ | 1105 | /* Check that we have not crossed a zone boundary. */ |
1008 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 1106 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
1009 | continue; | 1107 | break; |
1010 | 1108 | ||
1011 | /* | 1109 | /* |
1012 | * If we don't have enough swap space, reclaiming of | 1110 | * If we don't have enough swap space, reclaiming of |
@@ -1014,23 +1112,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1014 | * pointless. | 1112 | * pointless. |
1015 | */ | 1113 | */ |
1016 | if (nr_swap_pages <= 0 && PageAnon(cursor_page) && | 1114 | if (nr_swap_pages <= 0 && PageAnon(cursor_page) && |
1017 | !PageSwapCache(cursor_page)) | 1115 | !PageSwapCache(cursor_page)) |
1018 | continue; | 1116 | break; |
1019 | 1117 | ||
1020 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1118 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1021 | list_move(&cursor_page->lru, dst); | 1119 | list_move(&cursor_page->lru, dst); |
1022 | mem_cgroup_del_lru(cursor_page); | 1120 | mem_cgroup_del_lru(cursor_page); |
1023 | nr_taken++; | 1121 | nr_taken += hpage_nr_pages(page); |
1024 | nr_lumpy_taken++; | 1122 | nr_lumpy_taken++; |
1025 | if (PageDirty(cursor_page)) | 1123 | if (PageDirty(cursor_page)) |
1026 | nr_lumpy_dirty++; | 1124 | nr_lumpy_dirty++; |
1027 | scan++; | 1125 | scan++; |
1028 | } else { | 1126 | } else { |
1029 | if (mode == ISOLATE_BOTH && | 1127 | /* |
1030 | page_count(cursor_page)) | 1128 | * Check if the page is freed already. |
1031 | nr_lumpy_failed++; | 1129 | * |
1130 | * We can't use page_count() as that | ||
1131 | * requires compound_head and we don't | ||
1132 | * have a pin on the page here. If a | ||
1133 | * page is tail, we may or may not | ||
1134 | * have isolated the head, so assume | ||
1135 | * it's not free, it'd be tricky to | ||
1136 | * track the head status without a | ||
1137 | * page pin. | ||
1138 | */ | ||
1139 | if (!PageTail(cursor_page) && | ||
1140 | !atomic_read(&cursor_page->_count)) | ||
1141 | continue; | ||
1142 | break; | ||
1032 | } | 1143 | } |
1033 | } | 1144 | } |
1145 | |||
1146 | /* If we break out of the loop above, lumpy reclaim failed */ | ||
1147 | if (pfn < end_pfn) | ||
1148 | nr_lumpy_failed++; | ||
1034 | } | 1149 | } |
1035 | 1150 | ||
1036 | *scanned = scan; | 1151 | *scanned = scan; |
@@ -1070,14 +1185,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
1070 | struct page *page; | 1185 | struct page *page; |
1071 | 1186 | ||
1072 | list_for_each_entry(page, page_list, lru) { | 1187 | list_for_each_entry(page, page_list, lru) { |
1188 | int numpages = hpage_nr_pages(page); | ||
1073 | lru = page_lru_base_type(page); | 1189 | lru = page_lru_base_type(page); |
1074 | if (PageActive(page)) { | 1190 | if (PageActive(page)) { |
1075 | lru += LRU_ACTIVE; | 1191 | lru += LRU_ACTIVE; |
1076 | ClearPageActive(page); | 1192 | ClearPageActive(page); |
1077 | nr_active++; | 1193 | nr_active += numpages; |
1078 | } | 1194 | } |
1079 | if (count) | 1195 | if (count) |
1080 | count[lru]++; | 1196 | count[lru] += numpages; |
1081 | } | 1197 | } |
1082 | 1198 | ||
1083 | return nr_active; | 1199 | return nr_active; |
@@ -1112,13 +1228,16 @@ int isolate_lru_page(struct page *page) | |||
1112 | { | 1228 | { |
1113 | int ret = -EBUSY; | 1229 | int ret = -EBUSY; |
1114 | 1230 | ||
1231 | VM_BUG_ON(!page_count(page)); | ||
1232 | |||
1115 | if (PageLRU(page)) { | 1233 | if (PageLRU(page)) { |
1116 | struct zone *zone = page_zone(page); | 1234 | struct zone *zone = page_zone(page); |
1117 | 1235 | ||
1118 | spin_lock_irq(&zone->lru_lock); | 1236 | spin_lock_irq(&zone->lru_lock); |
1119 | if (PageLRU(page) && get_page_unless_zero(page)) { | 1237 | if (PageLRU(page)) { |
1120 | int lru = page_lru(page); | 1238 | int lru = page_lru(page); |
1121 | ret = 0; | 1239 | ret = 0; |
1240 | get_page(page); | ||
1122 | ClearPageLRU(page); | 1241 | ClearPageLRU(page); |
1123 | 1242 | ||
1124 | del_page_from_lru_list(zone, page, lru); | 1243 | del_page_from_lru_list(zone, page, lru); |
@@ -1187,7 +1306,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, | |||
1187 | add_page_to_lru_list(zone, page, lru); | 1306 | add_page_to_lru_list(zone, page, lru); |
1188 | if (is_active_lru(lru)) { | 1307 | if (is_active_lru(lru)) { |
1189 | int file = is_file_lru(lru); | 1308 | int file = is_file_lru(lru); |
1190 | reclaim_stat->recent_rotated[file]++; | 1309 | int numpages = hpage_nr_pages(page); |
1310 | reclaim_stat->recent_rotated[file] += numpages; | ||
1191 | } | 1311 | } |
1192 | if (!pagevec_add(&pvec, page)) { | 1312 | if (!pagevec_add(&pvec, page)) { |
1193 | spin_unlock_irq(&zone->lru_lock); | 1313 | spin_unlock_irq(&zone->lru_lock); |
@@ -1253,7 +1373,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, | |||
1253 | return false; | 1373 | return false; |
1254 | 1374 | ||
1255 | /* Only stall on lumpy reclaim */ | 1375 | /* Only stall on lumpy reclaim */ |
1256 | if (!sc->lumpy_reclaim_mode) | 1376 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) |
1257 | return false; | 1377 | return false; |
1258 | 1378 | ||
1259 | /* If we have relaimed everything on the isolated list, no stall */ | 1379 | /* If we have relaimed everything on the isolated list, no stall */ |
@@ -1286,7 +1406,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1286 | unsigned long nr_scanned; | 1406 | unsigned long nr_scanned; |
1287 | unsigned long nr_reclaimed = 0; | 1407 | unsigned long nr_reclaimed = 0; |
1288 | unsigned long nr_taken; | 1408 | unsigned long nr_taken; |
1289 | unsigned long nr_active; | ||
1290 | unsigned long nr_anon; | 1409 | unsigned long nr_anon; |
1291 | unsigned long nr_file; | 1410 | unsigned long nr_file; |
1292 | 1411 | ||
@@ -1298,15 +1417,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1298 | return SWAP_CLUSTER_MAX; | 1417 | return SWAP_CLUSTER_MAX; |
1299 | } | 1418 | } |
1300 | 1419 | ||
1301 | 1420 | set_reclaim_mode(priority, sc, false); | |
1302 | lru_add_drain(); | 1421 | lru_add_drain(); |
1303 | spin_lock_irq(&zone->lru_lock); | 1422 | spin_lock_irq(&zone->lru_lock); |
1304 | 1423 | ||
1305 | if (scanning_global_lru(sc)) { | 1424 | if (scanning_global_lru(sc)) { |
1306 | nr_taken = isolate_pages_global(nr_to_scan, | 1425 | nr_taken = isolate_pages_global(nr_to_scan, |
1307 | &page_list, &nr_scanned, sc->order, | 1426 | &page_list, &nr_scanned, sc->order, |
1308 | sc->lumpy_reclaim_mode ? | 1427 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1309 | ISOLATE_BOTH : ISOLATE_INACTIVE, | 1428 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1310 | zone, 0, file); | 1429 | zone, 0, file); |
1311 | zone->pages_scanned += nr_scanned; | 1430 | zone->pages_scanned += nr_scanned; |
1312 | if (current_is_kswapd()) | 1431 | if (current_is_kswapd()) |
@@ -1318,8 +1437,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1318 | } else { | 1437 | } else { |
1319 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, | 1438 | nr_taken = mem_cgroup_isolate_pages(nr_to_scan, |
1320 | &page_list, &nr_scanned, sc->order, | 1439 | &page_list, &nr_scanned, sc->order, |
1321 | sc->lumpy_reclaim_mode ? | 1440 | sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? |
1322 | ISOLATE_BOTH : ISOLATE_INACTIVE, | 1441 | ISOLATE_BOTH : ISOLATE_INACTIVE, |
1323 | zone, sc->mem_cgroup, | 1442 | zone, sc->mem_cgroup, |
1324 | 0, file); | 1443 | 0, file); |
1325 | /* | 1444 | /* |
@@ -1337,20 +1456,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1337 | 1456 | ||
1338 | spin_unlock_irq(&zone->lru_lock); | 1457 | spin_unlock_irq(&zone->lru_lock); |
1339 | 1458 | ||
1340 | nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); | 1459 | nr_reclaimed = shrink_page_list(&page_list, zone, sc); |
1341 | 1460 | ||
1342 | /* Check if we should syncronously wait for writeback */ | 1461 | /* Check if we should syncronously wait for writeback */ |
1343 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | 1462 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { |
1344 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1463 | set_reclaim_mode(priority, sc, true); |
1345 | 1464 | nr_reclaimed += shrink_page_list(&page_list, zone, sc); | |
1346 | /* | ||
1347 | * The attempt at page out may have made some | ||
1348 | * of the pages active, mark them inactive again. | ||
1349 | */ | ||
1350 | nr_active = clear_active_flags(&page_list, NULL); | ||
1351 | count_vm_events(PGDEACTIVATE, nr_active); | ||
1352 | |||
1353 | nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); | ||
1354 | } | 1465 | } |
1355 | 1466 | ||
1356 | local_irq_disable(); | 1467 | local_irq_disable(); |
@@ -1359,6 +1470,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, | |||
1359 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); | 1470 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); |
1360 | 1471 | ||
1361 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); | 1472 | putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); |
1473 | |||
1474 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | ||
1475 | zone_idx(zone), | ||
1476 | nr_scanned, nr_reclaimed, | ||
1477 | priority, | ||
1478 | trace_shrink_flags(file, sc->reclaim_mode)); | ||
1362 | return nr_reclaimed; | 1479 | return nr_reclaimed; |
1363 | } | 1480 | } |
1364 | 1481 | ||
@@ -1398,7 +1515,7 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1398 | 1515 | ||
1399 | list_move(&page->lru, &zone->lru[lru].list); | 1516 | list_move(&page->lru, &zone->lru[lru].list); |
1400 | mem_cgroup_add_lru_list(page, lru); | 1517 | mem_cgroup_add_lru_list(page, lru); |
1401 | pgmoved++; | 1518 | pgmoved += hpage_nr_pages(page); |
1402 | 1519 | ||
1403 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | 1520 | if (!pagevec_add(&pvec, page) || list_empty(list)) { |
1404 | spin_unlock_irq(&zone->lru_lock); | 1521 | spin_unlock_irq(&zone->lru_lock); |
@@ -1466,7 +1583,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1466 | } | 1583 | } |
1467 | 1584 | ||
1468 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | 1585 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1469 | nr_rotated++; | 1586 | nr_rotated += hpage_nr_pages(page); |
1470 | /* | 1587 | /* |
1471 | * Identify referenced, file-backed active pages and | 1588 | * Identify referenced, file-backed active pages and |
1472 | * give them one more trip around the active list. So | 1589 | * give them one more trip around the active list. So |
@@ -1506,6 +1623,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1506 | spin_unlock_irq(&zone->lru_lock); | 1623 | spin_unlock_irq(&zone->lru_lock); |
1507 | } | 1624 | } |
1508 | 1625 | ||
1626 | #ifdef CONFIG_SWAP | ||
1509 | static int inactive_anon_is_low_global(struct zone *zone) | 1627 | static int inactive_anon_is_low_global(struct zone *zone) |
1510 | { | 1628 | { |
1511 | unsigned long active, inactive; | 1629 | unsigned long active, inactive; |
@@ -1531,12 +1649,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
1531 | { | 1649 | { |
1532 | int low; | 1650 | int low; |
1533 | 1651 | ||
1652 | /* | ||
1653 | * If we don't have swap space, anonymous page deactivation | ||
1654 | * is pointless. | ||
1655 | */ | ||
1656 | if (!total_swap_pages) | ||
1657 | return 0; | ||
1658 | |||
1534 | if (scanning_global_lru(sc)) | 1659 | if (scanning_global_lru(sc)) |
1535 | low = inactive_anon_is_low_global(zone); | 1660 | low = inactive_anon_is_low_global(zone); |
1536 | else | 1661 | else |
1537 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); | 1662 | low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); |
1538 | return low; | 1663 | return low; |
1539 | } | 1664 | } |
1665 | #else | ||
1666 | static inline int inactive_anon_is_low(struct zone *zone, | ||
1667 | struct scan_control *sc) | ||
1668 | { | ||
1669 | return 0; | ||
1670 | } | ||
1671 | #endif | ||
1540 | 1672 | ||
1541 | static int inactive_file_is_low_global(struct zone *zone) | 1673 | static int inactive_file_is_low_global(struct zone *zone) |
1542 | { | 1674 | { |
@@ -1598,26 +1730,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1598 | } | 1730 | } |
1599 | 1731 | ||
1600 | /* | 1732 | /* |
1601 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
1602 | * until we collected @swap_cluster_max pages to scan. | ||
1603 | */ | ||
1604 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
1605 | unsigned long *nr_saved_scan) | ||
1606 | { | ||
1607 | unsigned long nr; | ||
1608 | |||
1609 | *nr_saved_scan += nr_to_scan; | ||
1610 | nr = *nr_saved_scan; | ||
1611 | |||
1612 | if (nr >= SWAP_CLUSTER_MAX) | ||
1613 | *nr_saved_scan = 0; | ||
1614 | else | ||
1615 | nr = 0; | ||
1616 | |||
1617 | return nr; | ||
1618 | } | ||
1619 | |||
1620 | /* | ||
1621 | * Determine how aggressively the anon and file LRU lists should be | 1733 | * Determine how aggressively the anon and file LRU lists should be |
1622 | * scanned. The relative value of each set of LRU lists is determined | 1734 | * scanned. The relative value of each set of LRU lists is determined |
1623 | * by looking at the fraction of the pages scanned we did rotate back | 1735 | * by looking at the fraction of the pages scanned we did rotate back |
@@ -1635,6 +1747,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1635 | u64 fraction[2], denominator; | 1747 | u64 fraction[2], denominator; |
1636 | enum lru_list l; | 1748 | enum lru_list l; |
1637 | int noswap = 0; | 1749 | int noswap = 0; |
1750 | int force_scan = 0; | ||
1751 | |||
1752 | |||
1753 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1754 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1755 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1756 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1757 | |||
1758 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | ||
1759 | /* kswapd does zone balancing and need to scan this zone */ | ||
1760 | if (scanning_global_lru(sc) && current_is_kswapd()) | ||
1761 | force_scan = 1; | ||
1762 | /* memcg may have small limit and need to avoid priority drop */ | ||
1763 | if (!scanning_global_lru(sc)) | ||
1764 | force_scan = 1; | ||
1765 | } | ||
1638 | 1766 | ||
1639 | /* If we have no swap space, do not bother scanning anon pages. */ | 1767 | /* If we have no swap space, do not bother scanning anon pages. */ |
1640 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1768 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1645,11 +1773,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1645 | goto out; | 1773 | goto out; |
1646 | } | 1774 | } |
1647 | 1775 | ||
1648 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1649 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1650 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1651 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1652 | |||
1653 | if (scanning_global_lru(sc)) { | 1776 | if (scanning_global_lru(sc)) { |
1654 | free = zone_page_state(zone, NR_FREE_PAGES); | 1777 | free = zone_page_state(zone, NR_FREE_PAGES); |
1655 | /* If we have very few page cache pages, | 1778 | /* If we have very few page cache pages, |
@@ -1716,24 +1839,87 @@ out: | |||
1716 | scan >>= priority; | 1839 | scan >>= priority; |
1717 | scan = div64_u64(scan * fraction[file], denominator); | 1840 | scan = div64_u64(scan * fraction[file], denominator); |
1718 | } | 1841 | } |
1719 | nr[l] = nr_scan_try_batch(scan, | 1842 | |
1720 | &reclaim_stat->nr_saved_scan[l]); | 1843 | /* |
1844 | * If zone is small or memcg is small, nr[l] can be 0. | ||
1845 | * This results no-scan on this priority and priority drop down. | ||
1846 | * For global direct reclaim, it can visit next zone and tend | ||
1847 | * not to have problems. For global kswapd, it's for zone | ||
1848 | * balancing and it need to scan a small amounts. When using | ||
1849 | * memcg, priority drop can cause big latency. So, it's better | ||
1850 | * to scan small amount. See may_noscan above. | ||
1851 | */ | ||
1852 | if (!scan && force_scan) { | ||
1853 | if (file) | ||
1854 | scan = SWAP_CLUSTER_MAX; | ||
1855 | else if (!noswap) | ||
1856 | scan = SWAP_CLUSTER_MAX; | ||
1857 | } | ||
1858 | nr[l] = scan; | ||
1721 | } | 1859 | } |
1722 | } | 1860 | } |
1723 | 1861 | ||
1724 | static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) | 1862 | /* |
1863 | * Reclaim/compaction depends on a number of pages being freed. To avoid | ||
1864 | * disruption to the system, a small number of order-0 pages continue to be | ||
1865 | * rotated and reclaimed in the normal fashion. However, by the time we get | ||
1866 | * back to the allocator and call try_to_compact_zone(), we ensure that | ||
1867 | * there are enough free pages for it to be likely successful | ||
1868 | */ | ||
1869 | static inline bool should_continue_reclaim(struct zone *zone, | ||
1870 | unsigned long nr_reclaimed, | ||
1871 | unsigned long nr_scanned, | ||
1872 | struct scan_control *sc) | ||
1725 | { | 1873 | { |
1874 | unsigned long pages_for_compaction; | ||
1875 | unsigned long inactive_lru_pages; | ||
1876 | |||
1877 | /* If not in reclaim/compaction mode, stop */ | ||
1878 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | ||
1879 | return false; | ||
1880 | |||
1881 | /* Consider stopping depending on scan and reclaim activity */ | ||
1882 | if (sc->gfp_mask & __GFP_REPEAT) { | ||
1883 | /* | ||
1884 | * For __GFP_REPEAT allocations, stop reclaiming if the | ||
1885 | * full LRU list has been scanned and we are still failing | ||
1886 | * to reclaim pages. This full LRU scan is potentially | ||
1887 | * expensive but a __GFP_REPEAT caller really wants to succeed | ||
1888 | */ | ||
1889 | if (!nr_reclaimed && !nr_scanned) | ||
1890 | return false; | ||
1891 | } else { | ||
1892 | /* | ||
1893 | * For non-__GFP_REPEAT allocations which can presumably | ||
1894 | * fail without consequence, stop if we failed to reclaim | ||
1895 | * any pages from the last SWAP_CLUSTER_MAX number of | ||
1896 | * pages that were scanned. This will return to the | ||
1897 | * caller faster at the risk reclaim/compaction and | ||
1898 | * the resulting allocation attempt fails | ||
1899 | */ | ||
1900 | if (!nr_reclaimed) | ||
1901 | return false; | ||
1902 | } | ||
1903 | |||
1726 | /* | 1904 | /* |
1727 | * If we need a large contiguous chunk of memory, or have | 1905 | * If we have not reclaimed enough pages for compaction and the |
1728 | * trouble getting a small set of contiguous pages, we | 1906 | * inactive lists are large enough, continue reclaiming |
1729 | * will reclaim both active and inactive pages. | ||
1730 | */ | 1907 | */ |
1731 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | 1908 | pages_for_compaction = (2UL << sc->order); |
1732 | sc->lumpy_reclaim_mode = 1; | 1909 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + |
1733 | else if (sc->order && priority < DEF_PRIORITY - 2) | 1910 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
1734 | sc->lumpy_reclaim_mode = 1; | 1911 | if (sc->nr_reclaimed < pages_for_compaction && |
1735 | else | 1912 | inactive_lru_pages > pages_for_compaction) |
1736 | sc->lumpy_reclaim_mode = 0; | 1913 | return true; |
1914 | |||
1915 | /* If compaction would go ahead or the allocation would succeed, stop */ | ||
1916 | switch (compaction_suitable(zone, sc->order)) { | ||
1917 | case COMPACT_PARTIAL: | ||
1918 | case COMPACT_CONTINUE: | ||
1919 | return false; | ||
1920 | default: | ||
1921 | return true; | ||
1922 | } | ||
1737 | } | 1923 | } |
1738 | 1924 | ||
1739 | /* | 1925 | /* |
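should_continue_reclaim(), added in the hunk above, keeps reclaim going until enough order-0 pages have been freed for compaction to have a chance: pages_for_compaction is 2 << order, so an order-4 request wants about 32 pages before handing over to the allocator. The sketch below models only that core check, ignoring the __GFP_REPEAT and compaction_suitable() branches:

```c
/* Worked example of the reclaim/compaction continuation check. */
#include <stdio.h>
#include <stdbool.h>

static bool keep_reclaiming(int order, unsigned long nr_reclaimed,
			    unsigned long inactive_lru_pages)
{
	unsigned long pages_for_compaction = 2UL << order;

	return nr_reclaimed < pages_for_compaction &&
	       inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
	/* order-4 allocation: 2 << 4 = 32 pages are wanted before compaction */
	printf("order=4, reclaimed=10, inactive=500 -> continue? %d\n",
	       keep_reclaiming(4, 10, 500));	/* 1: not enough freed yet */
	printf("order=4, reclaimed=40, inactive=500 -> continue? %d\n",
	       keep_reclaiming(4, 40, 500));	/* 0: enough for compaction */
	printf("order=4, reclaimed=10, inactive=20  -> continue? %d\n",
	       keep_reclaiming(4, 10, 20));	/* 0: nothing left worth scanning */
	return 0;
}
```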
@@ -1745,13 +1931,14 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1745 | unsigned long nr[NR_LRU_LISTS]; | 1931 | unsigned long nr[NR_LRU_LISTS]; |
1746 | unsigned long nr_to_scan; | 1932 | unsigned long nr_to_scan; |
1747 | enum lru_list l; | 1933 | enum lru_list l; |
1748 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1934 | unsigned long nr_reclaimed, nr_scanned; |
1749 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1935 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1750 | 1936 | ||
1937 | restart: | ||
1938 | nr_reclaimed = 0; | ||
1939 | nr_scanned = sc->nr_scanned; | ||
1751 | get_scan_count(zone, sc, nr, priority); | 1940 | get_scan_count(zone, sc, nr, priority); |
1752 | 1941 | ||
1753 | set_lumpy_reclaim_mode(priority, sc); | ||
1754 | |||
1755 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1942 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1756 | nr[LRU_INACTIVE_FILE]) { | 1943 | nr[LRU_INACTIVE_FILE]) { |
1757 | for_each_evictable_lru(l) { | 1944 | for_each_evictable_lru(l) { |
@@ -1775,16 +1962,20 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1775 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 1962 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
1776 | break; | 1963 | break; |
1777 | } | 1964 | } |
1778 | 1965 | sc->nr_reclaimed += nr_reclaimed; | |
1779 | sc->nr_reclaimed = nr_reclaimed; | ||
1780 | 1966 | ||
1781 | /* | 1967 | /* |
1782 | * Even if we did not try to evict anon pages at all, we want to | 1968 | * Even if we did not try to evict anon pages at all, we want to |
1783 | * rebalance the anon lru active/inactive ratio. | 1969 | * rebalance the anon lru active/inactive ratio. |
1784 | */ | 1970 | */ |
1785 | if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) | 1971 | if (inactive_anon_is_low(zone, sc)) |
1786 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1972 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
1787 | 1973 | ||
1974 | /* reclaim/compaction might need reclaim to continue */ | ||
1975 | if (should_continue_reclaim(zone, nr_reclaimed, | ||
1976 | sc->nr_scanned - nr_scanned, sc)) | ||
1977 | goto restart; | ||
1978 | |||
1788 | throttle_vm_writeout(sc->gfp_mask); | 1979 | throttle_vm_writeout(sc->gfp_mask); |
1789 | } | 1980 | } |
1790 | 1981 | ||
@@ -1809,6 +2000,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
1809 | { | 2000 | { |
1810 | struct zoneref *z; | 2001 | struct zoneref *z; |
1811 | struct zone *zone; | 2002 | struct zone *zone; |
2003 | unsigned long nr_soft_reclaimed; | ||
2004 | unsigned long nr_soft_scanned; | ||
1812 | 2005 | ||
1813 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2006 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1814 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2007 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -1823,6 +2016,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
1823 | continue; | 2016 | continue; |
1824 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2017 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1825 | continue; /* Let kswapd poll it */ | 2018 | continue; /* Let kswapd poll it */ |
2019 | /* | ||
2020 | * This steals pages from memory cgroups over softlimit | ||
2021 | * and returns the number of reclaimed pages and | ||
2022 | * scanned pages. This works for global memory pressure | ||
2023 | * and balancing, not for a memcg's limit. | ||
2024 | */ | ||
2025 | nr_soft_scanned = 0; | ||
2026 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2027 | sc->order, sc->gfp_mask, | ||
2028 | &nr_soft_scanned); | ||
2029 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2030 | sc->nr_scanned += nr_soft_scanned; | ||
2031 | /* need some check for avoid more shrink_zone() */ | ||
1826 | } | 2032 | } |
1827 | 2033 | ||
1828 | shrink_zone(priority, zone, sc); | 2034 | shrink_zone(priority, zone, sc); |
@@ -1834,17 +2040,12 @@ static bool zone_reclaimable(struct zone *zone) | |||
1834 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; | 2040 | return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; |
1835 | } | 2041 | } |
1836 | 2042 | ||
1837 | /* | 2043 | /* All zones in zonelist are unreclaimable? */ |
1838 | * As hibernation is going on, kswapd is freezed so that it can't mark | ||
1839 | * the zone into all_unreclaimable. It can't handle OOM during hibernation. | ||
1840 | * So let's check zone's unreclaimable in direct reclaim as well as kswapd. | ||
1841 | */ | ||
1842 | static bool all_unreclaimable(struct zonelist *zonelist, | 2044 | static bool all_unreclaimable(struct zonelist *zonelist, |
1843 | struct scan_control *sc) | 2045 | struct scan_control *sc) |
1844 | { | 2046 | { |
1845 | struct zoneref *z; | 2047 | struct zoneref *z; |
1846 | struct zone *zone; | 2048 | struct zone *zone; |
1847 | bool all_unreclaimable = true; | ||
1848 | 2049 | ||
1849 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2050 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1850 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2051 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -1852,13 +2053,11 @@ static bool all_unreclaimable(struct zonelist *zonelist, | |||
1852 | continue; | 2053 | continue; |
1853 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2054 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1854 | continue; | 2055 | continue; |
1855 | if (zone_reclaimable(zone)) { | 2056 | if (!zone->all_unreclaimable) |
1856 | all_unreclaimable = false; | 2057 | return false; |
1857 | break; | ||
1858 | } | ||
1859 | } | 2058 | } |
1860 | 2059 | ||
1861 | return all_unreclaimable; | 2060 | return true; |
1862 | } | 2061 | } |
1863 | 2062 | ||
1864 | /* | 2063 | /* |
@@ -1878,7 +2077,8 @@ static bool all_unreclaimable(struct zonelist *zonelist, | |||
1878 | * else, the number of pages reclaimed | 2077 | * else, the number of pages reclaimed |
1879 | */ | 2078 | */ |
1880 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | 2079 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
1881 | struct scan_control *sc) | 2080 | struct scan_control *sc, |
2081 | struct shrink_control *shrink) | ||
1882 | { | 2082 | { |
1883 | int priority; | 2083 | int priority; |
1884 | unsigned long total_scanned = 0; | 2084 | unsigned long total_scanned = 0; |
@@ -1896,7 +2096,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1896 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2096 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1897 | sc->nr_scanned = 0; | 2097 | sc->nr_scanned = 0; |
1898 | if (!priority) | 2098 | if (!priority) |
1899 | disable_swap_token(); | 2099 | disable_swap_token(sc->mem_cgroup); |
1900 | shrink_zones(priority, zonelist, sc); | 2100 | shrink_zones(priority, zonelist, sc); |
1901 | /* | 2101 | /* |
1902 | * Don't shrink slabs when reclaiming memory from | 2102 | * Don't shrink slabs when reclaiming memory from |
@@ -1912,7 +2112,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1912 | lru_pages += zone_reclaimable_pages(zone); | 2112 | lru_pages += zone_reclaimable_pages(zone); |
1913 | } | 2113 | } |
1914 | 2114 | ||
1915 | shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); | 2115 | shrink_slab(shrink, sc->nr_scanned, lru_pages); |
1916 | if (reclaim_state) { | 2116 | if (reclaim_state) { |
1917 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | 2117 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
1918 | reclaim_state->reclaimed_slab = 0; | 2118 | reclaim_state->reclaimed_slab = 0; |
@@ -1937,27 +2137,31 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1937 | 2137 | ||
1938 | /* Take a nap, wait for some writeback to complete */ | 2138 | /* Take a nap, wait for some writeback to complete */ |
1939 | if (!sc->hibernation_mode && sc->nr_scanned && | 2139 | if (!sc->hibernation_mode && sc->nr_scanned && |
1940 | priority < DEF_PRIORITY - 2) | 2140 | priority < DEF_PRIORITY - 2) { |
1941 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2141 | struct zone *preferred_zone; |
2142 | |||
2143 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | ||
2144 | &cpuset_current_mems_allowed, | ||
2145 | &preferred_zone); | ||
2146 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | ||
2147 | } | ||
1942 | } | 2148 | } |
1943 | 2149 | ||
1944 | out: | 2150 | out: |
1945 | /* | ||
1946 | * Now that we've scanned all the zones at this priority level, note | ||
1947 | * that level within the zone so that the next thread which performs | ||
1948 | * scanning of this zone will immediately start out at this priority | ||
1949 | * level. This affects only the decision whether or not to bring | ||
1950 | * mapped pages onto the inactive list. | ||
1951 | */ | ||
1952 | if (priority < 0) | ||
1953 | priority = 0; | ||
1954 | |||
1955 | delayacct_freepages_end(); | 2151 | delayacct_freepages_end(); |
1956 | put_mems_allowed(); | 2152 | put_mems_allowed(); |
1957 | 2153 | ||
1958 | if (sc->nr_reclaimed) | 2154 | if (sc->nr_reclaimed) |
1959 | return sc->nr_reclaimed; | 2155 | return sc->nr_reclaimed; |
1960 | 2156 | ||
2157 | /* | ||
2158 | * As hibernation is going on, kswapd is freezed so that it can't mark | ||
2159 | * the zone into all_unreclaimable. Thus bypassing all_unreclaimable | ||
2160 | * check. | ||
2161 | */ | ||
2162 | if (oom_killer_disabled) | ||
2163 | return 0; | ||
2164 | |||
1961 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 2165 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
1962 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) | 2166 | if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) |
1963 | return 1; | 2167 | return 1; |
@@ -1980,12 +2184,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1980 | .mem_cgroup = NULL, | 2184 | .mem_cgroup = NULL, |
1981 | .nodemask = nodemask, | 2185 | .nodemask = nodemask, |
1982 | }; | 2186 | }; |
2187 | struct shrink_control shrink = { | ||
2188 | .gfp_mask = sc.gfp_mask, | ||
2189 | }; | ||
1983 | 2190 | ||
1984 | trace_mm_vmscan_direct_reclaim_begin(order, | 2191 | trace_mm_vmscan_direct_reclaim_begin(order, |
1985 | sc.may_writepage, | 2192 | sc.may_writepage, |
1986 | gfp_mask); | 2193 | gfp_mask); |
1987 | 2194 | ||
1988 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 2195 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
1989 | 2196 | ||
1990 | trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); | 2197 | trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); |
1991 | 2198 | ||
@@ -1997,9 +2204,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
1997 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2204 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
1998 | gfp_t gfp_mask, bool noswap, | 2205 | gfp_t gfp_mask, bool noswap, |
1999 | unsigned int swappiness, | 2206 | unsigned int swappiness, |
2000 | struct zone *zone) | 2207 | struct zone *zone, |
2208 | unsigned long *nr_scanned) | ||
2001 | { | 2209 | { |
2002 | struct scan_control sc = { | 2210 | struct scan_control sc = { |
2211 | .nr_scanned = 0, | ||
2003 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2212 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2004 | .may_writepage = !laptop_mode, | 2213 | .may_writepage = !laptop_mode, |
2005 | .may_unmap = 1, | 2214 | .may_unmap = 1, |
@@ -2008,6 +2217,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2008 | .order = 0, | 2217 | .order = 0, |
2009 | .mem_cgroup = mem, | 2218 | .mem_cgroup = mem, |
2010 | }; | 2219 | }; |
2220 | |||
2011 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2221 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2012 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2222 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
2013 | 2223 | ||
@@ -2026,6 +2236,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2026 | 2236 | ||
2027 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); | 2237 | trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); |
2028 | 2238 | ||
2239 | *nr_scanned = sc.nr_scanned; | ||
2029 | return sc.nr_reclaimed; | 2240 | return sc.nr_reclaimed; |
2030 | } | 2241 | } |
2031 | 2242 | ||
@@ -2036,6 +2247,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2036 | { | 2247 | { |
2037 | struct zonelist *zonelist; | 2248 | struct zonelist *zonelist; |
2038 | unsigned long nr_reclaimed; | 2249 | unsigned long nr_reclaimed; |
2250 | int nid; | ||
2039 | struct scan_control sc = { | 2251 | struct scan_control sc = { |
2040 | .may_writepage = !laptop_mode, | 2252 | .may_writepage = !laptop_mode, |
2041 | .may_unmap = 1, | 2253 | .may_unmap = 1, |
@@ -2045,17 +2257,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2045 | .order = 0, | 2257 | .order = 0, |
2046 | .mem_cgroup = mem_cont, | 2258 | .mem_cgroup = mem_cont, |
2047 | .nodemask = NULL, /* we don't care the placement */ | 2259 | .nodemask = NULL, /* we don't care the placement */ |
2260 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | ||
2261 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | ||
2262 | }; | ||
2263 | struct shrink_control shrink = { | ||
2264 | .gfp_mask = sc.gfp_mask, | ||
2048 | }; | 2265 | }; |
2049 | 2266 | ||
2050 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2267 | /* |
2051 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 2268 | * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't |
2052 | zonelist = NODE_DATA(numa_node_id())->node_zonelists; | 2269 | * take care of from where we get pages. So the node where we start the |
2270 | * scan does not need to be the current node. | ||
2271 | */ | ||
2272 | nid = mem_cgroup_select_victim_node(mem_cont); | ||
2273 | |||
2274 | zonelist = NODE_DATA(nid)->node_zonelists; | ||
2053 | 2275 | ||
2054 | trace_mm_vmscan_memcg_reclaim_begin(0, | 2276 | trace_mm_vmscan_memcg_reclaim_begin(0, |
2055 | sc.may_writepage, | 2277 | sc.may_writepage, |
2056 | sc.gfp_mask); | 2278 | sc.gfp_mask); |
2057 | 2279 | ||
2058 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 2280 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2059 | 2281 | ||
2060 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 2282 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
2061 | 2283 | ||
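The added comment above notes that memcg reclaim does not care which node it starts on, so the starting node comes from mem_cgroup_select_victim_node() instead of being pinned to the current node. As a rough, self-contained illustration of that policy (a toy round-robin over a fixed node count; the real selector applies further filtering, such as skipping nodes that hold no pages for the cgroup), consider:

#include <stdio.h>

#define NR_DEMO_NODES 4	/* hypothetical node count for this sketch */

/* Toy round-robin victim-node selector: any node will do, so rotate. */
static int select_victim_node(int *last_scanned_node)
{
	*last_scanned_node = (*last_scanned_node + 1) % NR_DEMO_NODES;
	return *last_scanned_node;
}

int main(void)
{
	int last = NR_DEMO_NODES - 1;	/* so node 0 is picked first */
	int i;

	for (i = 0; i < 6; i++)
		printf("reclaim starts on node %d\n", select_victim_node(&last));
	return 0;
}

Each wakeup of the memcg reclaim path then spreads scanning pressure across nodes rather than always starting on whichever node the caller happens to run on.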
@@ -2063,38 +2285,88 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2063 | } | 2285 | } |
2064 | #endif | 2286 | #endif |
2065 | 2287 | ||
2288 | /* | ||
2289 | * pgdat_balanced is used when checking if a node is balanced for high-order | ||
2290 | * allocations. Only zones that meet watermarks and are in a zone allowed | ||
2291 | * by the caller's classzone_idx are added to balanced_pages. The total of | ||
2292 | * balanced pages must be at least 25% of the zones allowed by classzone_idx | ||
2293 | * for the node to be considered balanced. Forcing all zones to be balanced | ||
2294 | * for high orders can cause excessive reclaim when there are imbalanced zones. | ||
2295 | * The choice of 25% is due to | ||
2296 | * o a 16M DMA zone that is balanced will not balance a zone on any | ||
2297 | * reasonable sized machine | ||
2298 | * o On all other machines, the top zone must be at least a reasonable | ||
2299 | * percentage of the middle zones. For example, on 32-bit x86, highmem | ||
2300 | * would need to be at least 256M for it to balance a whole node. | ||
2301 | * Similarly, on x86-64 the Normal zone would need to be at least 1G | ||
2302 | * to balance a node on its own. These seemed like reasonable ratios. | ||
2303 | */ | ||
2304 | static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | ||
2305 | int classzone_idx) | ||
2306 | { | ||
2307 | unsigned long present_pages = 0; | ||
2308 | int i; | ||
2309 | |||
2310 | for (i = 0; i <= classzone_idx; i++) | ||
2311 | present_pages += pgdat->node_zones[i].present_pages; | ||
2312 | |||
2313 | /* A special case here: if the zone has no pages, we consider it balanced */ | ||
2314 | return balanced_pages >= (present_pages >> 2); | ||
2315 | } | ||
2316 | |||
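To make the 25% rule above concrete, the following standalone sketch reruns the same arithmetic as pgdat_balanced() against made-up zone sizes; the array and the figures are hypothetical, not kernel data:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-zone page counts for one node (4K pages). */
static const unsigned long demo_present_pages[] = {
	4096,	/* ZONE_DMA:     ~16MB  */
	225280,	/* ZONE_NORMAL:  ~880MB */
	786432,	/* ZONE_HIGHMEM: ~3GB   */
};

/* Same test as pgdat_balanced(): balanced pages must reach 25% of the
 * pages present in zones 0..classzone_idx (the >> 2 divides by four). */
static bool node_balanced(unsigned long balanced_pages, int classzone_idx)
{
	unsigned long present = 0;
	int i;

	for (i = 0; i <= classzone_idx; i++)
		present += demo_present_pages[i];

	return balanced_pages >= (present >> 2);
}

int main(void)
{
	/* With only ZONE_NORMAL at its watermark, 225280 of 1015808 pages
	 * (about 22%) are balanced, so the node is not balanced; HIGHMEM
	 * on its own easily clears the 25% bar. */
	printf("normal only:  %d\n", node_balanced(225280, 2));	/* 0 */
	printf("highmem only: %d\n", node_balanced(786432, 2));	/* 1 */
	return 0;
}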
2066 | /* is kswapd sleeping prematurely? */ | 2317 | /* is kswapd sleeping prematurely? */ |
2067 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | 2318 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, |
2319 | int classzone_idx) | ||
2068 | { | 2320 | { |
2069 | int i; | 2321 | int i; |
2322 | unsigned long balanced = 0; | ||
2323 | bool all_zones_ok = true; | ||
2070 | 2324 | ||
2071 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2325 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2072 | if (remaining) | 2326 | if (remaining) |
2073 | return 1; | 2327 | return true; |
2074 | 2328 | ||
2075 | /* If after HZ/10, a zone is below the high mark, it's premature */ | 2329 | /* Check the watermark levels */ |
2076 | for (i = 0; i < pgdat->nr_zones; i++) { | 2330 | for (i = 0; i <= classzone_idx; i++) { |
2077 | struct zone *zone = pgdat->node_zones + i; | 2331 | struct zone *zone = pgdat->node_zones + i; |
2078 | 2332 | ||
2079 | if (!populated_zone(zone)) | 2333 | if (!populated_zone(zone)) |
2080 | continue; | 2334 | continue; |
2081 | 2335 | ||
2082 | if (zone->all_unreclaimable) | 2336 | /* |
2337 | * balance_pgdat() skips over all_unreclaimable after | ||
2338 | * DEF_PRIORITY. Effectively, it considers them balanced so | ||
2339 | * they must be considered balanced here as well if kswapd | ||
2340 | * is to sleep | ||
2341 | */ | ||
2342 | if (zone->all_unreclaimable) { | ||
2343 | balanced += zone->present_pages; | ||
2083 | continue; | 2344 | continue; |
2345 | } | ||
2084 | 2346 | ||
2085 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2347 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2086 | 0, 0)) | 2348 | i, 0)) |
2087 | return 1; | 2349 | all_zones_ok = false; |
2350 | else | ||
2351 | balanced += zone->present_pages; | ||
2088 | } | 2352 | } |
2089 | 2353 | ||
2090 | return 0; | 2354 | /* |
2355 | * For high-order requests, the balanced zones must contain at least | ||
2356 | * 25% of the node's pages for kswapd to sleep. For order-0, all zones | ||
2357 | * must be balanced | ||
2358 | */ | ||
2359 | if (order) | ||
2360 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | ||
2361 | else | ||
2362 | return !all_zones_ok; | ||
2091 | } | 2363 | } |
2092 | 2364 | ||
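Putting the pieces together, the rewritten sleeping_prematurely() reduces to the decision sketched below: zones marked all_unreclaimable count as balanced, a high-order request only needs the 25% rule to hold, and an order-0 request still needs every zone at its high watermark. The struct and helper are illustrative stand-ins for the kernel types, not the real thing:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-zone snapshot used only for this sketch. */
struct zone_state {
	bool populated;
	bool all_unreclaimable;
	bool high_wmark_ok;	/* stands in for zone_watermark_ok_safe() */
	unsigned long present_pages;
};

/* Returns true when kswapd should NOT yet go fully to sleep. */
static bool would_sleep_prematurely(const struct zone_state *zones,
				    int classzone_idx, int order,
				    long remaining)
{
	unsigned long balanced = 0, present = 0;
	bool all_zones_ok = true;
	int i;

	if (remaining)	/* a direct reclaimer woke kswapd within HZ/10 */
		return true;

	for (i = 0; i <= classzone_idx; i++) {
		if (!zones[i].populated)
			continue;
		present += zones[i].present_pages;
		if (zones[i].all_unreclaimable || zones[i].high_wmark_ok)
			balanced += zones[i].present_pages;
		else
			all_zones_ok = false;
	}

	/* High-order: 25% of the node balanced is enough.
	 * Order-0: every zone must be at its high watermark. */
	return order ? balanced < (present >> 2) : !all_zones_ok;
}

int main(void)
{
	struct zone_state zones[2] = {
		{ .populated = true, .high_wmark_ok = true,  .present_pages = 1000 },
		{ .populated = true, .high_wmark_ok = false, .present_pages = 9000 },
	};

	/* Order-3 request: only 1000 of 10000 pages balanced (under 25%),
	 * so sleeping now would be premature. */
	printf("%d\n", would_sleep_prematurely(zones, 1, 3, 0));	/* 1 */
	return 0;
}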
2093 | /* | 2365 | /* |
2094 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2366 | * For kswapd, balance_pgdat() will work across all this node's zones until |
2095 | * they are all at high_wmark_pages(zone). | 2367 | * they are all at high_wmark_pages(zone). |
2096 | * | 2368 | * |
2097 | * Returns the number of pages which were actually freed. | 2369 | * Returns the final order kswapd was reclaiming at |
2098 | * | 2370 | * |
2099 | * There is special handling here for zones which are full of pinned pages. | 2371 | * There is special handling here for zones which are full of pinned pages. |
2100 | * This can happen if the pages are all mlocked, or if they are all used by | 2372 | * This can happen if the pages are all mlocked, or if they are all used by |
@@ -2111,13 +2383,18 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2111 | * interoperates with the page allocator fallback scheme to ensure that aging | 2383 | * interoperates with the page allocator fallback scheme to ensure that aging |
2112 | * of pages is balanced across the zones. | 2384 | * of pages is balanced across the zones. |
2113 | */ | 2385 | */ |
2114 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 2386 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2387 | int *classzone_idx) | ||
2115 | { | 2388 | { |
2116 | int all_zones_ok; | 2389 | int all_zones_ok; |
2390 | unsigned long balanced; | ||
2117 | int priority; | 2391 | int priority; |
2118 | int i; | 2392 | int i; |
2393 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2119 | unsigned long total_scanned; | 2394 | unsigned long total_scanned; |
2120 | struct reclaim_state *reclaim_state = current->reclaim_state; | 2395 | struct reclaim_state *reclaim_state = current->reclaim_state; |
2396 | unsigned long nr_soft_reclaimed; | ||
2397 | unsigned long nr_soft_scanned; | ||
2121 | struct scan_control sc = { | 2398 | struct scan_control sc = { |
2122 | .gfp_mask = GFP_KERNEL, | 2399 | .gfp_mask = GFP_KERNEL, |
2123 | .may_unmap = 1, | 2400 | .may_unmap = 1, |
@@ -2131,6 +2408,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
2131 | .order = order, | 2408 | .order = order, |
2132 | .mem_cgroup = NULL, | 2409 | .mem_cgroup = NULL, |
2133 | }; | 2410 | }; |
2411 | struct shrink_control shrink = { | ||
2412 | .gfp_mask = sc.gfp_mask, | ||
2413 | }; | ||
2134 | loop_again: | 2414 | loop_again: |
2135 | total_scanned = 0; | 2415 | total_scanned = 0; |
2136 | sc.nr_reclaimed = 0; | 2416 | sc.nr_reclaimed = 0; |
@@ -2138,15 +2418,15 @@ loop_again: | |||
2138 | count_vm_event(PAGEOUTRUN); | 2418 | count_vm_event(PAGEOUTRUN); |
2139 | 2419 | ||
2140 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2420 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2141 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | ||
2142 | unsigned long lru_pages = 0; | 2421 | unsigned long lru_pages = 0; |
2143 | int has_under_min_watermark_zone = 0; | 2422 | int has_under_min_watermark_zone = 0; |
2144 | 2423 | ||
2145 | /* The swap token gets in the way of swapout... */ | 2424 | /* The swap token gets in the way of swapout... */ |
2146 | if (!priority) | 2425 | if (!priority) |
2147 | disable_swap_token(); | 2426 | disable_swap_token(NULL); |
2148 | 2427 | ||
2149 | all_zones_ok = 1; | 2428 | all_zones_ok = 1; |
2429 | balanced = 0; | ||
2150 | 2430 | ||
2151 | /* | 2431 | /* |
2152 | * Scan in the highmem->dma direction for the highest | 2432 | * Scan in the highmem->dma direction for the highest |
@@ -2169,7 +2449,7 @@ loop_again: | |||
2169 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2449 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2170 | &sc, priority, 0); | 2450 | &sc, priority, 0); |
2171 | 2451 | ||
2172 | if (!zone_watermark_ok(zone, order, | 2452 | if (!zone_watermark_ok_safe(zone, order, |
2173 | high_wmark_pages(zone), 0, 0)) { | 2453 | high_wmark_pages(zone), 0, 0)) { |
2174 | end_zone = i; | 2454 | end_zone = i; |
2175 | break; | 2455 | break; |
@@ -2196,6 +2476,7 @@ loop_again: | |||
2196 | for (i = 0; i <= end_zone; i++) { | 2476 | for (i = 0; i <= end_zone; i++) { |
2197 | struct zone *zone = pgdat->node_zones + i; | 2477 | struct zone *zone = pgdat->node_zones + i; |
2198 | int nr_slab; | 2478 | int nr_slab; |
2479 | unsigned long balance_gap; | ||
2199 | 2480 | ||
2200 | if (!populated_zone(zone)) | 2481 | if (!populated_zone(zone)) |
2201 | continue; | 2482 | continue; |
@@ -2205,28 +2486,42 @@ loop_again: | |||
2205 | 2486 | ||
2206 | sc.nr_scanned = 0; | 2487 | sc.nr_scanned = 0; |
2207 | 2488 | ||
2489 | nr_soft_scanned = 0; | ||
2208 | /* | 2490 | /* |
2209 | * Call soft limit reclaim before calling shrink_zone. | 2491 | * Call soft limit reclaim before calling shrink_zone. |
2210 | * For now we ignore the return value | ||
2211 | */ | 2492 | */ |
2212 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); | 2493 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, |
2494 | order, sc.gfp_mask, | ||
2495 | &nr_soft_scanned); | ||
2496 | sc.nr_reclaimed += nr_soft_reclaimed; | ||
2497 | total_scanned += nr_soft_scanned; | ||
2213 | 2498 | ||
2214 | /* | 2499 | /* |
2215 | * We put equal pressure on every zone, unless one | 2500 | * We put equal pressure on every zone, unless |
2216 | * zone has way too many pages free already. | 2501 | * one zone has way too many pages free |
2502 | * already. The "too many pages" is defined | ||
2503 | * as the high wmark plus a "gap" where the | ||
2504 | * gap is either the low watermark or 1% | ||
2505 | * of the zone, whichever is smaller. | ||
2217 | */ | 2506 | */ |
2218 | if (!zone_watermark_ok(zone, order, | 2507 | balance_gap = min(low_wmark_pages(zone), |
2219 | 8*high_wmark_pages(zone), end_zone, 0)) | 2508 | (zone->present_pages + |
2509 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | ||
2510 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2511 | if (!zone_watermark_ok_safe(zone, order, | ||
2512 | high_wmark_pages(zone) + balance_gap, | ||
2513 | end_zone, 0)) { | ||
2220 | shrink_zone(priority, zone, &sc); | 2514 | shrink_zone(priority, zone, &sc); |
2221 | reclaim_state->reclaimed_slab = 0; | 2515 | |
2222 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 2516 | reclaim_state->reclaimed_slab = 0; |
2223 | lru_pages); | 2517 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
2224 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2518 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2225 | total_scanned += sc.nr_scanned; | 2519 | total_scanned += sc.nr_scanned; |
2226 | if (zone->all_unreclaimable) | 2520 | |
2227 | continue; | 2521 | if (nr_slab == 0 && !zone_reclaimable(zone)) |
2228 | if (nr_slab == 0 && !zone_reclaimable(zone)) | 2522 | zone->all_unreclaimable = 1; |
2229 | zone->all_unreclaimable = 1; | 2523 | } |
2524 | |||
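For a concrete feel of the balance gap described in the comment above: KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 in this tree, matching the "1% of the zone" wording, so for a hypothetical 4GB zone the gap comes out as below. The watermark and zone size are invented for illustration:

#include <stdio.h>

#define BALANCE_GAP_RATIO 100	/* i.e. cap the gap at ~1% of the zone */

static unsigned long balance_gap(unsigned long low_wmark,
				 unsigned long present_pages)
{
	unsigned long one_percent =
		(present_pages + BALANCE_GAP_RATIO - 1) / BALANCE_GAP_RATIO;

	return low_wmark < one_percent ? low_wmark : one_percent;
}

int main(void)
{
	/* Made-up zone: 1048576 pages (~4GB of 4K pages), low watermark of
	 * 16384 pages.  1% of the zone is 10486 pages, which is smaller, so
	 * kswapd keeps reclaiming until free pages exceed high_wmark + 10486
	 * rather than high_wmark + 16384. */
	printf("gap = %lu pages\n", balance_gap(16384, 1048576));
	return 0;
}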
2230 | /* | 2525 | /* |
2231 | * If we've done a decent amount of scanning and | 2526 | * If we've done a decent amount of scanning and |
2232 | * the reclaim ratio is low, start doing writepage | 2527 | * the reclaim ratio is low, start doing writepage |
@@ -2236,7 +2531,13 @@ loop_again: | |||
2236 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2531 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2237 | sc.may_writepage = 1; | 2532 | sc.may_writepage = 1; |
2238 | 2533 | ||
2239 | if (!zone_watermark_ok(zone, order, | 2534 | if (zone->all_unreclaimable) { |
2535 | if (end_zone && end_zone == i) | ||
2536 | end_zone--; | ||
2537 | continue; | ||
2538 | } | ||
2539 | |||
2540 | if (!zone_watermark_ok_safe(zone, order, | ||
2240 | high_wmark_pages(zone), end_zone, 0)) { | 2541 | high_wmark_pages(zone), end_zone, 0)) { |
2241 | all_zones_ok = 0; | 2542 | all_zones_ok = 0; |
2242 | /* | 2543 | /* |
@@ -2244,13 +2545,24 @@ loop_again: | |||
2244 | * means that we have a GFP_ATOMIC allocation | 2545 | * means that we have a GFP_ATOMIC allocation |
2245 | * failure risk. Hurry up! | 2546 | * failure risk. Hurry up! |
2246 | */ | 2547 | */ |
2247 | if (!zone_watermark_ok(zone, order, | 2548 | if (!zone_watermark_ok_safe(zone, order, |
2248 | min_wmark_pages(zone), end_zone, 0)) | 2549 | min_wmark_pages(zone), end_zone, 0)) |
2249 | has_under_min_watermark_zone = 1; | 2550 | has_under_min_watermark_zone = 1; |
2551 | } else { | ||
2552 | /* | ||
2553 | * If a zone reaches its high watermark, | ||
2554 | * consider it to be no longer congested. It's | ||
2555 | * possible there are dirty pages backed by | ||
2556 | * congested BDIs but as pressure is relieved, | ||
2557 | * speculatively avoid congestion waits | ||
2558 | */ | ||
2559 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2560 | if (i <= *classzone_idx) | ||
2561 | balanced += zone->present_pages; | ||
2250 | } | 2562 | } |
2251 | 2563 | ||
2252 | } | 2564 | } |
2253 | if (all_zones_ok) | 2565 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2254 | break; /* kswapd: all done */ | 2566 | break; /* kswapd: all done */ |
2255 | /* | 2567 | /* |
2256 | * OK, kswapd is getting into trouble. Take a nap, then take | 2568 | * OK, kswapd is getting into trouble. Take a nap, then take |
@@ -2273,7 +2585,13 @@ loop_again: | |||
2273 | break; | 2585 | break; |
2274 | } | 2586 | } |
2275 | out: | 2587 | out: |
2276 | if (!all_zones_ok) { | 2588 | |
2589 | /* | ||
2590 | * order-0: All zones must meet high watermark for a balanced node | ||
2591 | * high-order: Balanced zones must make up at least 25% of the node | ||
2592 | * for the node to be balanced | ||
2593 | */ | ||
2594 | if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { | ||
2277 | cond_resched(); | 2595 | cond_resched(); |
2278 | 2596 | ||
2279 | try_to_freeze(); | 2597 | try_to_freeze(); |
@@ -2298,7 +2616,88 @@ out: | |||
2298 | goto loop_again; | 2616 | goto loop_again; |
2299 | } | 2617 | } |
2300 | 2618 | ||
2301 | return sc.nr_reclaimed; | 2619 | /* |
2620 | * If kswapd was reclaiming at a higher order, it has the option of | ||
2621 | * sleeping without all zones being balanced. Before it does, it must | ||
2622 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
2623 | * that the congestion flags are cleared. The congestion flag must | ||
2624 | * be cleared as kswapd is the only mechanism that clears the flag | ||
2625 | * and it is potentially going to sleep here. | ||
2626 | */ | ||
2627 | if (order) { | ||
2628 | for (i = 0; i <= end_zone; i++) { | ||
2629 | struct zone *zone = pgdat->node_zones + i; | ||
2630 | |||
2631 | if (!populated_zone(zone)) | ||
2632 | continue; | ||
2633 | |||
2634 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | ||
2635 | continue; | ||
2636 | |||
2637 | /* Confirm the zone is balanced for order-0 */ | ||
2638 | if (!zone_watermark_ok(zone, 0, | ||
2639 | high_wmark_pages(zone), 0, 0)) { | ||
2640 | order = sc.order = 0; | ||
2641 | goto loop_again; | ||
2642 | } | ||
2643 | |||
2644 | /* If balanced, clear the congested flag */ | ||
2645 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2646 | } | ||
2647 | } | ||
2648 | |||
2649 | /* | ||
2650 | * Return the order we were reclaiming at so sleeping_prematurely() | ||
2651 | * makes a decision on the order we were last reclaiming at. However, | ||
2652 | * if another caller entered the allocator slow path while kswapd | ||
2653 | * was awake, order will remain at the higher level | ||
2654 | */ | ||
2655 | *classzone_idx = end_zone; | ||
2656 | return order; | ||
2657 | } | ||
2658 | |||
2659 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | ||
2660 | { | ||
2661 | long remaining = 0; | ||
2662 | DEFINE_WAIT(wait); | ||
2663 | |||
2664 | if (freezing(current) || kthread_should_stop()) | ||
2665 | return; | ||
2666 | |||
2667 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2668 | |||
2669 | /* Try to sleep for a short interval */ | ||
2670 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2671 | remaining = schedule_timeout(HZ/10); | ||
2672 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2673 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2674 | } | ||
2675 | |||
2676 | /* | ||
2677 | * After a short sleep, check if it was a premature sleep. If not, then | ||
2678 | * go fully to sleep until explicitly woken up. | ||
2679 | */ | ||
2680 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | ||
2681 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2682 | |||
2683 | /* | ||
2684 | * vmstat counters are not perfectly accurate and the estimated | ||
2685 | * value for counters such as NR_FREE_PAGES can deviate from the | ||
2686 | * true value by nr_online_cpus * threshold. To avoid the zone | ||
2687 | * watermarks being breached while under pressure, we reduce the | ||
2688 | * per-cpu vmstat threshold while kswapd is awake and restore | ||
2689 | * them before going back to sleep. | ||
2690 | */ | ||
2691 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | ||
2692 | schedule(); | ||
2693 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | ||
2694 | } else { | ||
2695 | if (remaining) | ||
2696 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2697 | else | ||
2698 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2699 | } | ||
2700 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2302 | } | 2701 | } |
2303 | 2702 | ||
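The vmstat comment in kswapd_try_to_sleep() is easier to appreciate with numbers. Assuming, purely for illustration, 8 online CPUs and a per-cpu vmstat threshold of 125 pages, the global estimate of a counter such as NR_FREE_PAGES can lag the true value by:

#include <stdio.h>

int main(void)
{
	unsigned int online_cpus = 8;	/* hypothetical */
	unsigned int threshold   = 125;	/* hypothetical pages per cpu */

	/* Worst-case drift is cpus * threshold: 1000 pages here, about 4MB
	 * of 4K pages, enough to breach a watermark unnoticed, which is why
	 * the threshold is tightened while kswapd is awake and only relaxed
	 * for the duration of its sleep. */
	printf("worst-case drift: %u pages (%u KB)\n",
	       online_cpus * threshold,
	       online_cpus * threshold * 4);
	return 0;
}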
2304 | /* | 2703 | /* |
@@ -2316,10 +2715,11 @@ out: | |||
2316 | */ | 2715 | */ |
2317 | static int kswapd(void *p) | 2716 | static int kswapd(void *p) |
2318 | { | 2717 | { |
2319 | unsigned long order; | 2718 | unsigned long order, new_order; |
2719 | int classzone_idx, new_classzone_idx; | ||
2320 | pg_data_t *pgdat = (pg_data_t*)p; | 2720 | pg_data_t *pgdat = (pg_data_t*)p; |
2321 | struct task_struct *tsk = current; | 2721 | struct task_struct *tsk = current; |
2322 | DEFINE_WAIT(wait); | 2722 | |
2323 | struct reclaim_state reclaim_state = { | 2723 | struct reclaim_state reclaim_state = { |
2324 | .reclaimed_slab = 0, | 2724 | .reclaimed_slab = 0, |
2325 | }; | 2725 | }; |
@@ -2346,50 +2746,37 @@ static int kswapd(void *p) | |||
2346 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 2746 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
2347 | set_freezable(); | 2747 | set_freezable(); |
2348 | 2748 | ||
2349 | order = 0; | 2749 | order = new_order = 0; |
2750 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; | ||
2350 | for ( ; ; ) { | 2751 | for ( ; ; ) { |
2351 | unsigned long new_order; | ||
2352 | int ret; | 2752 | int ret; |
2353 | 2753 | ||
2354 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2754 | /* |
2355 | new_order = pgdat->kswapd_max_order; | 2755 | * If the last balance_pgdat was unsuccessful it's unlikely a |
2356 | pgdat->kswapd_max_order = 0; | 2756 | * new request of a similar or harder type will succeed soon |
2357 | if (order < new_order) { | 2757 | * so consider going to sleep on the basis we reclaimed at |
2758 | */ | ||
2759 | if (classzone_idx >= new_classzone_idx && order == new_order) { | ||
2760 | new_order = pgdat->kswapd_max_order; | ||
2761 | new_classzone_idx = pgdat->classzone_idx; | ||
2762 | pgdat->kswapd_max_order = 0; | ||
2763 | pgdat->classzone_idx = pgdat->nr_zones - 1; | ||
2764 | } | ||
2765 | |||
2766 | if (order < new_order || classzone_idx > new_classzone_idx) { | ||
2358 | /* | 2767 | /* |
2359 | * Don't sleep if someone wants a larger 'order' | 2768 | * Don't sleep if someone wants a larger 'order' |
2360 | * allocation | 2769 | * allocation or has tighter zone constraints
2361 | */ | 2770 | */ |
2362 | order = new_order; | 2771 | order = new_order; |
2772 | classzone_idx = new_classzone_idx; | ||
2363 | } else { | 2773 | } else { |
2364 | if (!freezing(current) && !kthread_should_stop()) { | 2774 | kswapd_try_to_sleep(pgdat, order, classzone_idx); |
2365 | long remaining = 0; | ||
2366 | |||
2367 | /* Try to sleep for a short interval */ | ||
2368 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2369 | remaining = schedule_timeout(HZ/10); | ||
2370 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2371 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
2372 | } | ||
2373 | |||
2374 | /* | ||
2375 | * After a short sleep, check if it was a | ||
2376 | * premature sleep. If not, then go fully | ||
2377 | * to sleep until explicitly woken up | ||
2378 | */ | ||
2379 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
2380 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | ||
2381 | schedule(); | ||
2382 | } else { | ||
2383 | if (remaining) | ||
2384 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
2385 | else | ||
2386 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
2387 | } | ||
2388 | } | ||
2389 | |||
2390 | order = pgdat->kswapd_max_order; | 2775 | order = pgdat->kswapd_max_order; |
2776 | classzone_idx = pgdat->classzone_idx; | ||
2777 | pgdat->kswapd_max_order = 0; | ||
2778 | pgdat->classzone_idx = pgdat->nr_zones - 1; | ||
2391 | } | 2779 | } |
2392 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
2393 | 2780 | ||
2394 | ret = try_to_freeze(); | 2781 | ret = try_to_freeze(); |
2395 | if (kthread_should_stop()) | 2782 | if (kthread_should_stop()) |
@@ -2401,7 +2788,7 @@ static int kswapd(void *p) | |||
2401 | */ | 2788 | */ |
2402 | if (!ret) { | 2789 | if (!ret) { |
2403 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); | 2790 | trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); |
2404 | balance_pgdat(pgdat, order); | 2791 | order = balance_pgdat(pgdat, order, &classzone_idx); |
2405 | } | 2792 | } |
2406 | } | 2793 | } |
2407 | return 0; | 2794 | return 0; |
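The new order/classzone_idx bookkeeping in kswapd()'s main loop implements a simple rule: kswapd stays awake and reclaims again when the pending wakeup asks for more than it just delivered, where a larger order or a lower classzone_idx counts as harder; otherwise it tries to sleep. A self-contained sketch of that comparison, with a hypothetical struct standing in for the pgdat fields:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for pgdat->kswapd_max_order / pgdat->classzone_idx. */
struct wake_request {
	int order;		/* largest order any waker asked for */
	int classzone_idx;	/* lowest (most restrictive) classzone set */
};

/* True if the pending request is harder than what kswapd just balanced:
 * a bigger order, or a lower classzone_idx (the waker can only use zones
 * up to a lower index, so freeing higher zones does not help it). */
static bool harder_request_pending(int done_order, int done_classzone_idx,
				   const struct wake_request *req)
{
	return req->order > done_order ||
	       req->classzone_idx < done_classzone_idx;
}

int main(void)
{
	struct wake_request req = { .order = 0, .classzone_idx = 1 };

	/* kswapd just balanced order-2 up to index 2 (e.g. highmem); an
	 * order-0 request restricted to index 1 is still harder because it
	 * cannot use the highmem pages that were freed. */
	printf("%d\n", harder_request_pending(2, 2, &req));	/* 1 */
	return 0;
}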
@@ -2410,23 +2797,26 @@ static int kswapd(void *p) | |||
2410 | /* | 2797 | /* |
2411 | * A zone is low on free memory, so wake its kswapd task to service it. | 2798 | * A zone is low on free memory, so wake its kswapd task to service it. |
2412 | */ | 2799 | */ |
2413 | void wakeup_kswapd(struct zone *zone, int order) | 2800 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
2414 | { | 2801 | { |
2415 | pg_data_t *pgdat; | 2802 | pg_data_t *pgdat; |
2416 | 2803 | ||
2417 | if (!populated_zone(zone)) | 2804 | if (!populated_zone(zone)) |
2418 | return; | 2805 | return; |
2419 | 2806 | ||
2420 | pgdat = zone->zone_pgdat; | ||
2421 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2422 | return; | ||
2423 | if (pgdat->kswapd_max_order < order) | ||
2424 | pgdat->kswapd_max_order = order; | ||
2425 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2426 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 2807 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2427 | return; | 2808 | return; |
2809 | pgdat = zone->zone_pgdat; | ||
2810 | if (pgdat->kswapd_max_order < order) { | ||
2811 | pgdat->kswapd_max_order = order; | ||
2812 | pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); | ||
2813 | } | ||
2428 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2814 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2429 | return; | 2815 | return; |
2816 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2817 | return; | ||
2818 | |||
2819 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2430 | wake_up_interruptible(&pgdat->kswapd_wait); | 2820 | wake_up_interruptible(&pgdat->kswapd_wait); |
2431 | } | 2821 | } |
2432 | 2822 | ||
@@ -2487,7 +2877,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2487 | .swappiness = vm_swappiness, | 2877 | .swappiness = vm_swappiness, |
2488 | .order = 0, | 2878 | .order = 0, |
2489 | }; | 2879 | }; |
2490 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | 2880 | struct shrink_control shrink = { |
2881 | .gfp_mask = sc.gfp_mask, | ||
2882 | }; | ||
2883 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
2491 | struct task_struct *p = current; | 2884 | struct task_struct *p = current; |
2492 | unsigned long nr_reclaimed; | 2885 | unsigned long nr_reclaimed; |
2493 | 2886 | ||
@@ -2496,7 +2889,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2496 | reclaim_state.reclaimed_slab = 0; | 2889 | reclaim_state.reclaimed_slab = 0; |
2497 | p->reclaim_state = &reclaim_state; | 2890 | p->reclaim_state = &reclaim_state; |
2498 | 2891 | ||
2499 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 2892 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); |
2500 | 2893 | ||
2501 | p->reclaim_state = NULL; | 2894 | p->reclaim_state = NULL; |
2502 | lockdep_clear_current_reclaim_state(); | 2895 | lockdep_clear_current_reclaim_state(); |
@@ -2671,6 +3064,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2671 | .swappiness = vm_swappiness, | 3064 | .swappiness = vm_swappiness, |
2672 | .order = order, | 3065 | .order = order, |
2673 | }; | 3066 | }; |
3067 | struct shrink_control shrink = { | ||
3068 | .gfp_mask = sc.gfp_mask, | ||
3069 | }; | ||
2674 | unsigned long nr_slab_pages0, nr_slab_pages1; | 3070 | unsigned long nr_slab_pages0, nr_slab_pages1; |
2675 | 3071 | ||
2676 | cond_resched(); | 3072 | cond_resched(); |
@@ -2712,7 +3108,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2712 | unsigned long lru_pages = zone_reclaimable_pages(zone); | 3108 | unsigned long lru_pages = zone_reclaimable_pages(zone); |
2713 | 3109 | ||
2714 | /* No reclaimable slab or very low memory pressure */ | 3110 | /* No reclaimable slab or very low memory pressure */ |
2715 | if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) | 3111 | if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) |
2716 | break; | 3112 | break; |
2717 | 3113 | ||
2718 | /* Freed enough memory */ | 3114 | /* Freed enough memory */ |
@@ -2987,6 +3383,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write, | |||
2987 | return 0; | 3383 | return 0; |
2988 | } | 3384 | } |
2989 | 3385 | ||
3386 | #ifdef CONFIG_NUMA | ||
2990 | /* | 3387 | /* |
2991 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of | 3388 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of |
2992 | * a specified node's per zone unevictable lists for evictable pages. | 3389 | * a specified node's per zone unevictable lists for evictable pages. |
@@ -3033,4 +3430,4 @@ void scan_unevictable_unregister_node(struct node *node) | |||
3033 | { | 3430 | { |
3034 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | 3431 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); |
3035 | } | 3432 | } |
3036 | 3433 | #endif | |