path: root/mm/vmscan.c
author    Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>    2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm/vmscan.c
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent    6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  853
1 files changed, 625 insertions, 228 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..d036e59d302b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,8 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h>
45#include <linux/prefetch.h>
43 46
44#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
45#include <asm/div64.h> 48#include <asm/div64.h>
@@ -51,6 +54,24 @@
51#define CREATE_TRACE_POINTS 54#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 55#include <trace/events/vmscan.h>
53 56
57/*
58 * reclaim_mode determines how the inactive list is shrunk
59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
60 * RECLAIM_MODE_ASYNC: Do not block
61 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
63 * page from the LRU and reclaim all pages within a
64 * naturally aligned range
65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
66 * order-0 pages and then compact the zone
67 */
68typedef unsigned __bitwise__ reclaim_mode_t;
69#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
70#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
71#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
72#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
73#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
74
54struct scan_control { 75struct scan_control {
55 /* Incremented by the number of inactive pages that were scanned */ 76 /* Incremented by the number of inactive pages that were scanned */
56 unsigned long nr_scanned; 77 unsigned long nr_scanned;
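For illustration only (not part of the patch): a minimal userspace C sketch of how the RECLAIM_MODE_* bits introduced in the hunk above combine, mirroring the set_reclaim_mode() helper added later in this patch. The sparse annotations (__bitwise__/__force) are dropped so it builds as plain C, and PAGE_ALLOC_COSTLY_ORDER and DEF_PRIORITY are filled in with their usual kernel values as assumptions.

#include <stdio.h>

/* sketch only: plain unsigned int instead of the kernel's __bitwise__ type */
typedef unsigned int reclaim_mode_t;
#define RECLAIM_MODE_SINGLE        0x01u
#define RECLAIM_MODE_ASYNC         0x02u
#define RECLAIM_MODE_SYNC          0x04u
#define RECLAIM_MODE_LUMPYRECLAIM  0x08u
#define RECLAIM_MODE_COMPACTION    0x10u

#define PAGE_ALLOC_COSTLY_ORDER 3   /* assumption: usual kernel value */
#define DEF_PRIORITY 12             /* assumption: usual kernel value */

static reclaim_mode_t pick_mode(int order, int priority, int have_compaction, int sync)
{
	reclaim_mode_t mode = have_compaction ? RECLAIM_MODE_COMPACTION
					      : RECLAIM_MODE_LUMPYRECLAIM;
	reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;

	/* costly orders, or persistent high-order pressure, keep lumpy/compaction */
	if (order > PAGE_ALLOC_COSTLY_ORDER ||
	    (order && priority < DEF_PRIORITY - 2))
		return mode | syncmode;

	/* otherwise fall back to plain order-0, non-blocking reclaim */
	return RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
}

int main(void)
{
	reclaim_mode_t m = pick_mode(4, DEF_PRIORITY, 1, 1);

	printf("sync: %s, compaction: %s\n",
	       (m & RECLAIM_MODE_SYNC) ? "yes" : "no",
	       (m & RECLAIM_MODE_COMPACTION) ? "yes" : "no");
	return 0;
}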
@@ -79,10 +100,10 @@ struct scan_control {
79 int order; 100 int order;
80 101
81 /* 102 /*
82 * Intend to reclaim enough contenious memory rather than to reclaim 103 * Intend to reclaim enough continuous memory rather than reclaim
83 * enough amount memory. I.e, it's the mode for high order allocation. 104 * enough amount of memory. i.e, mode for high order allocation.
84 */ 105 */
85 bool lumpy_reclaim_mode; 106 reclaim_mode_t reclaim_mode;
86 107
87 /* Which cgroup do we reclaim from */ 108 /* Which cgroup do we reclaim from */
88 struct mem_cgroup *mem_cgroup; 109 struct mem_cgroup *mem_cgroup;
@@ -152,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
152 struct scan_control *sc, enum lru_list lru) 173 struct scan_control *sc, enum lru_list lru)
153{ 174{
154 if (!scanning_global_lru(sc)) 175 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
156 177
157 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
158} 179}
@@ -181,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
181} 202}
182EXPORT_SYMBOL(unregister_shrinker); 203EXPORT_SYMBOL(unregister_shrinker);
183 204
205static inline int do_shrinker_shrink(struct shrinker *shrinker,
206 struct shrink_control *sc,
207 unsigned long nr_to_scan)
208{
209 sc->nr_to_scan = nr_to_scan;
210 return (*shrinker->shrink)(shrinker, sc);
211}
212
184#define SHRINK_BATCH 128 213#define SHRINK_BATCH 128
185/* 214/*
186 * Call the shrink functions to age shrinkable caches 215 * Call the shrink functions to age shrinkable caches
@@ -201,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker);
201 * 230 *
202 * Returns the number of slab objects which we shrunk. 231 * Returns the number of slab objects which we shrunk.
203 */ 232 */
204unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 233unsigned long shrink_slab(struct shrink_control *shrink,
205 unsigned long lru_pages) 234 unsigned long nr_pages_scanned,
235 unsigned long lru_pages)
206{ 236{
207 struct shrinker *shrinker; 237 struct shrinker *shrinker;
208 unsigned long ret = 0; 238 unsigned long ret = 0;
209 239
210 if (scanned == 0) 240 if (nr_pages_scanned == 0)
211 scanned = SWAP_CLUSTER_MAX; 241 nr_pages_scanned = SWAP_CLUSTER_MAX;
212 242
213 if (!down_read_trylock(&shrinker_rwsem)) 243 if (!down_read_trylock(&shrinker_rwsem)) {
214 return 1; /* Assume we'll be able to shrink next time */ 244 /* Assume we'll be able to shrink next time */
245 ret = 1;
246 goto out;
247 }
215 248
216 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
217 unsigned long long delta; 250 unsigned long long delta;
218 unsigned long total_scan; 251 unsigned long total_scan;
219 unsigned long max_pass; 252 unsigned long max_pass;
220 253
221 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); 254 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
222 delta = (4 * scanned) / shrinker->seeks; 255 delta = (4 * nr_pages_scanned) / shrinker->seeks;
223 delta *= max_pass; 256 delta *= max_pass;
224 do_div(delta, lru_pages + 1); 257 do_div(delta, lru_pages + 1);
225 shrinker->nr += delta; 258 shrinker->nr += delta;
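For illustration only (not part of the patch): the delta computation just above sets how hard shrink_slab() pushes each registered shrinker in proportion to LRU scanning pressure. A standalone C sketch with made-up numbers; do_div() is replaced by a plain 64-bit division and DEFAULT_SEEKS is assumed to be its usual value of 2.

#include <stdio.h>

int main(void)
{
	unsigned long long nr_pages_scanned = 1024;    /* LRU pages scanned so far */
	unsigned long long lru_pages = 262144;         /* reclaimable LRU pages */
	unsigned long long max_pass = 50000;           /* objects in this cache */
	unsigned int seeks = 2;                        /* DEFAULT_SEEKS (assumed) */

	unsigned long long delta = (4 * nr_pages_scanned) / seeks;
	delta *= max_pass;
	delta /= lru_pages + 1;                        /* stands in for do_div() */

	/* scanning ~0.4% of the LRU asks this cache to scan ~0.8% of its objects */
	printf("ask the shrinker to scan ~%llu of %llu objects\n", delta, max_pass);
	return 0;
}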
@@ -246,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
246 int shrink_ret; 279 int shrink_ret;
247 int nr_before; 280 int nr_before;
248 281
249 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); 282 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
250 shrink_ret = (*shrinker->shrink)(shrinker, this_scan, 283 shrink_ret = do_shrinker_shrink(shrinker, shrink,
251 gfp_mask); 284 this_scan);
252 if (shrink_ret == -1) 285 if (shrink_ret == -1)
253 break; 286 break;
254 if (shrink_ret < nr_before) 287 if (shrink_ret < nr_before)
@@ -262,9 +295,44 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 shrinker->nr += total_scan; 295 shrinker->nr += total_scan;
263 } 296 }
264 up_read(&shrinker_rwsem); 297 up_read(&shrinker_rwsem);
298out:
299 cond_resched();
265 return ret; 300 return ret;
266} 301}
267 302
303static void set_reclaim_mode(int priority, struct scan_control *sc,
304 bool sync)
305{
306 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
307
308 /*
309 * Initially assume we are entering either lumpy reclaim or
310 * reclaim/compaction. Depending on the order, we will either set the
311 * sync mode or just reclaim order-0 pages later.
312 */
313 if (COMPACTION_BUILD)
314 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
315 else
316 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
317
318 /*
319 * Avoid using lumpy reclaim or reclaim/compaction if possible by
320 * restricting when its set to either costly allocations or when
321 * under memory pressure
322 */
323 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
324 sc->reclaim_mode |= syncmode;
325 else if (sc->order && priority < DEF_PRIORITY - 2)
326 sc->reclaim_mode |= syncmode;
327 else
328 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
329}
330
331static void reset_reclaim_mode(struct scan_control *sc)
332{
333 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
334}
335
268static inline int is_page_cache_freeable(struct page *page) 336static inline int is_page_cache_freeable(struct page *page)
269{ 337{
270 /* 338 /*
@@ -275,7 +343,8 @@ static inline int is_page_cache_freeable(struct page *page)
275 return page_count(page) - page_has_private(page) == 2; 343 return page_count(page) - page_has_private(page) == 2;
276} 344}
277 345
278static int may_write_to_queue(struct backing_dev_info *bdi) 346static int may_write_to_queue(struct backing_dev_info *bdi,
347 struct scan_control *sc)
279{ 348{
280 if (current->flags & PF_SWAPWRITE) 349 if (current->flags & PF_SWAPWRITE)
281 return 1; 350 return 1;
@@ -283,6 +352,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
283 return 1; 352 return 1;
284 if (bdi == current->backing_dev_info) 353 if (bdi == current->backing_dev_info)
285 return 1; 354 return 1;
355
356 /* lumpy reclaim for hugepage often need a lot of write */
357 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
358 return 1;
286 return 0; 359 return 0;
287} 360}
288 361
@@ -301,18 +374,12 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
301static void handle_write_error(struct address_space *mapping, 374static void handle_write_error(struct address_space *mapping,
302 struct page *page, int error) 375 struct page *page, int error)
303{ 376{
304 lock_page_nosync(page); 377 lock_page(page);
305 if (page_mapping(page) == mapping) 378 if (page_mapping(page) == mapping)
306 mapping_set_error(mapping, error); 379 mapping_set_error(mapping, error);
307 unlock_page(page); 380 unlock_page(page);
308} 381}
309 382
310/* Request for sync pageout. */
311enum pageout_io {
312 PAGEOUT_IO_ASYNC,
313 PAGEOUT_IO_SYNC,
314};
315
316/* possible outcome of pageout() */ 383/* possible outcome of pageout() */
317typedef enum { 384typedef enum {
318 /* failed to write page out, page is locked */ 385 /* failed to write page out, page is locked */
@@ -330,7 +397,7 @@ typedef enum {
330 * Calls ->writepage(). 397 * Calls ->writepage().
331 */ 398 */
332static pageout_t pageout(struct page *page, struct address_space *mapping, 399static pageout_t pageout(struct page *page, struct address_space *mapping,
333 enum pageout_io sync_writeback) 400 struct scan_control *sc)
334{ 401{
335 /* 402 /*
336 * If the page is dirty, only perform writeback if that write 403 * If the page is dirty, only perform writeback if that write
@@ -366,7 +433,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
366 } 433 }
367 if (mapping->a_ops->writepage == NULL) 434 if (mapping->a_ops->writepage == NULL)
368 return PAGE_ACTIVATE; 435 return PAGE_ACTIVATE;
369 if (!may_write_to_queue(mapping->backing_dev_info)) 436 if (!may_write_to_queue(mapping->backing_dev_info, sc))
370 return PAGE_KEEP; 437 return PAGE_KEEP;
371 438
372 if (clear_page_dirty_for_io(page)) { 439 if (clear_page_dirty_for_io(page)) {
@@ -376,7 +443,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
376 .nr_to_write = SWAP_CLUSTER_MAX, 443 .nr_to_write = SWAP_CLUSTER_MAX,
377 .range_start = 0, 444 .range_start = 0,
378 .range_end = LLONG_MAX, 445 .range_end = LLONG_MAX,
379 .nonblocking = 1,
380 .for_reclaim = 1, 446 .for_reclaim = 1,
381 }; 447 };
382 448
@@ -394,7 +460,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
394 * direct reclaiming a large contiguous area and the 460 * direct reclaiming a large contiguous area and the
395 * first attempt to free a range of pages fails. 461 * first attempt to free a range of pages fails.
396 */ 462 */
397 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 463 if (PageWriteback(page) &&
464 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
398 wait_on_page_writeback(page); 465 wait_on_page_writeback(page);
399 466
400 if (!PageWriteback(page)) { 467 if (!PageWriteback(page)) {
@@ -402,7 +469,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
402 ClearPageReclaim(page); 469 ClearPageReclaim(page);
403 } 470 }
404 trace_mm_vmscan_writepage(page, 471 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback)); 472 trace_reclaim_flags(page, sc->reclaim_mode));
406 inc_zone_page_state(page, NR_VMSCAN_WRITE); 473 inc_zone_page_state(page, NR_VMSCAN_WRITE);
407 return PAGE_SUCCESS; 474 return PAGE_SUCCESS;
408 } 475 }
@@ -459,9 +526,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
459 spin_unlock_irq(&mapping->tree_lock); 526 spin_unlock_irq(&mapping->tree_lock);
460 swapcache_free(swap, page); 527 swapcache_free(swap, page);
461 } else { 528 } else {
462 __remove_from_page_cache(page); 529 void (*freepage)(struct page *);
530
531 freepage = mapping->a_ops->freepage;
532
533 __delete_from_page_cache(page);
463 spin_unlock_irq(&mapping->tree_lock); 534 spin_unlock_irq(&mapping->tree_lock);
464 mem_cgroup_uncharge_cache_page(page); 535 mem_cgroup_uncharge_cache_page(page);
536
537 if (freepage != NULL)
538 freepage(page);
465 } 539 }
466 540
467 return 1; 541 return 1;
@@ -580,7 +654,7 @@ static enum page_references page_check_references(struct page *page,
580 referenced_page = TestClearPageReferenced(page); 654 referenced_page = TestClearPageReferenced(page);
581 655
582 /* Lumpy reclaim - ignore references */ 656 /* Lumpy reclaim - ignore references */
583 if (sc->lumpy_reclaim_mode) 657 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
584 return PAGEREF_RECLAIM; 658 return PAGEREF_RECLAIM;
585 659
586 /* 660 /*
@@ -616,7 +690,7 @@ static enum page_references page_check_references(struct page *page,
616 } 690 }
617 691
618 /* Reclaim if clean, defer dirty pages to writeback */ 692 /* Reclaim if clean, defer dirty pages to writeback */
619 if (referenced_page) 693 if (referenced_page && !PageSwapBacked(page))
620 return PAGEREF_RECLAIM_CLEAN; 694 return PAGEREF_RECLAIM_CLEAN;
621 695
622 return PAGEREF_RECLAIM; 696 return PAGEREF_RECLAIM;
@@ -644,12 +718,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
644 * shrink_page_list() returns the number of reclaimed pages 718 * shrink_page_list() returns the number of reclaimed pages
645 */ 719 */
646static unsigned long shrink_page_list(struct list_head *page_list, 720static unsigned long shrink_page_list(struct list_head *page_list,
647 struct scan_control *sc, 721 struct zone *zone,
648 enum pageout_io sync_writeback) 722 struct scan_control *sc)
649{ 723{
650 LIST_HEAD(ret_pages); 724 LIST_HEAD(ret_pages);
651 LIST_HEAD(free_pages); 725 LIST_HEAD(free_pages);
652 int pgactivate = 0; 726 int pgactivate = 0;
727 unsigned long nr_dirty = 0;
728 unsigned long nr_congested = 0;
653 unsigned long nr_reclaimed = 0; 729 unsigned long nr_reclaimed = 0;
654 730
655 cond_resched(); 731 cond_resched();
@@ -669,6 +745,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
669 goto keep; 745 goto keep;
670 746
671 VM_BUG_ON(PageActive(page)); 747 VM_BUG_ON(PageActive(page));
748 VM_BUG_ON(page_zone(page) != zone);
672 749
673 sc->nr_scanned++; 750 sc->nr_scanned++;
674 751
@@ -694,10 +771,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
694 * for any page for which writeback has already 771 * for any page for which writeback has already
695 * started. 772 * started.
696 */ 773 */
697 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 774 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
775 may_enter_fs)
698 wait_on_page_writeback(page); 776 wait_on_page_writeback(page);
699 else 777 else {
700 goto keep_locked; 778 unlock_page(page);
779 goto keep_lumpy;
780 }
701 } 781 }
702 782
703 references = page_check_references(page, sc); 783 references = page_check_references(page, sc);
@@ -743,6 +823,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
743 } 823 }
744 824
745 if (PageDirty(page)) { 825 if (PageDirty(page)) {
826 nr_dirty++;
827
746 if (references == PAGEREF_RECLAIM_CLEAN) 828 if (references == PAGEREF_RECLAIM_CLEAN)
747 goto keep_locked; 829 goto keep_locked;
748 if (!may_enter_fs) 830 if (!may_enter_fs)
@@ -751,14 +833,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
751 goto keep_locked; 833 goto keep_locked;
752 834
753 /* Page is dirty, try to write it out here */ 835 /* Page is dirty, try to write it out here */
754 switch (pageout(page, mapping, sync_writeback)) { 836 switch (pageout(page, mapping, sc)) {
755 case PAGE_KEEP: 837 case PAGE_KEEP:
838 nr_congested++;
756 goto keep_locked; 839 goto keep_locked;
757 case PAGE_ACTIVATE: 840 case PAGE_ACTIVATE:
758 goto activate_locked; 841 goto activate_locked;
759 case PAGE_SUCCESS: 842 case PAGE_SUCCESS:
760 if (PageWriteback(page) || PageDirty(page)) 843 if (PageWriteback(page))
844 goto keep_lumpy;
845 if (PageDirty(page))
761 goto keep; 846 goto keep;
847
762 /* 848 /*
763 * A synchronous write - probably a ramdisk. Go 849 * A synchronous write - probably a ramdisk. Go
764 * ahead and try to reclaim the page. 850 * ahead and try to reclaim the page.
@@ -841,6 +927,7 @@ cull_mlocked:
841 try_to_free_swap(page); 927 try_to_free_swap(page);
842 unlock_page(page); 928 unlock_page(page);
843 putback_lru_page(page); 929 putback_lru_page(page);
930 reset_reclaim_mode(sc);
844 continue; 931 continue;
845 932
846activate_locked: 933activate_locked:
@@ -853,10 +940,21 @@ activate_locked:
853keep_locked: 940keep_locked:
854 unlock_page(page); 941 unlock_page(page);
855keep: 942keep:
943 reset_reclaim_mode(sc);
944keep_lumpy:
856 list_add(&page->lru, &ret_pages); 945 list_add(&page->lru, &ret_pages);
857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
858 } 947 }
859 948
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
860 free_page_list(&free_pages); 958 free_page_list(&free_pages);
861 959
862 list_splice(&ret_pages, page_list); 960 list_splice(&ret_pages, page_list);
@@ -962,7 +1060,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
962 case 0: 1060 case 0:
963 list_move(&page->lru, dst); 1061 list_move(&page->lru, dst);
964 mem_cgroup_del_lru(page); 1062 mem_cgroup_del_lru(page);
965 nr_taken++; 1063 nr_taken += hpage_nr_pages(page);
966 break; 1064 break;
967 1065
968 case -EBUSY: 1066 case -EBUSY:
@@ -983,7 +1081,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
983 * surrounding the tag page. Only take those pages of 1081 * surrounding the tag page. Only take those pages of
984 * the same active state as that tag page. We may safely 1082 * the same active state as that tag page. We may safely
985 * round the target page pfn down to the requested order 1083 * round the target page pfn down to the requested order
986 * as the mem_map is guarenteed valid out to MAX_ORDER, 1084 * as the mem_map is guaranteed valid out to MAX_ORDER,
987 * where that page is in a different zone we will detect 1085 * where that page is in a different zone we will detect
988 * it from its zone id and abort this block scan. 1086 * it from its zone id and abort this block scan.
989 */ 1087 */
@@ -1006,7 +1104,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1006 1104
1007 /* Check that we have not crossed a zone boundary. */ 1105 /* Check that we have not crossed a zone boundary. */
1008 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1106 if (unlikely(page_zone_id(cursor_page) != zone_id))
1009 continue; 1107 break;
1010 1108
1011 /* 1109 /*
1012 * If we don't have enough swap space, reclaiming of 1110 * If we don't have enough swap space, reclaiming of
@@ -1014,23 +1112,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1014 * pointless. 1112 * pointless.
1015 */ 1113 */
1016 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1017 !PageSwapCache(cursor_page)) 1115 !PageSwapCache(cursor_page))
1018 continue; 1116 break;
1019 1117
1020 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1118 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1021 list_move(&cursor_page->lru, dst); 1119 list_move(&cursor_page->lru, dst);
1022 mem_cgroup_del_lru(cursor_page); 1120 mem_cgroup_del_lru(cursor_page);
1023 nr_taken++; 1121 nr_taken += hpage_nr_pages(page);
1024 nr_lumpy_taken++; 1122 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page)) 1123 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++; 1124 nr_lumpy_dirty++;
1027 scan++; 1125 scan++;
1028 } else { 1126 } else {
1029 if (mode == ISOLATE_BOTH && 1127 /*
1030 page_count(cursor_page)) 1128 * Check if the page is freed already.
1031 nr_lumpy_failed++; 1129 *
1130 * We can't use page_count() as that
1131 * requires compound_head and we don't
1132 * have a pin on the page here. If a
1133 * page is tail, we may or may not
1134 * have isolated the head, so assume
1135 * it's not free, it'd be tricky to
1136 * track the head status without a
1137 * page pin.
1138 */
1139 if (!PageTail(cursor_page) &&
1140 !atomic_read(&cursor_page->_count))
1141 continue;
1142 break;
1032 } 1143 }
1033 } 1144 }
1145
1146 /* If we break out of the loop above, lumpy reclaim failed */
1147 if (pfn < end_pfn)
1148 nr_lumpy_failed++;
1034 } 1149 }
1035 1150
1036 *scanned = scan; 1151 *scanned = scan;
@@ -1070,14 +1185,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1070 struct page *page; 1185 struct page *page;
1071 1186
1072 list_for_each_entry(page, page_list, lru) { 1187 list_for_each_entry(page, page_list, lru) {
1188 int numpages = hpage_nr_pages(page);
1073 lru = page_lru_base_type(page); 1189 lru = page_lru_base_type(page);
1074 if (PageActive(page)) { 1190 if (PageActive(page)) {
1075 lru += LRU_ACTIVE; 1191 lru += LRU_ACTIVE;
1076 ClearPageActive(page); 1192 ClearPageActive(page);
1077 nr_active++; 1193 nr_active += numpages;
1078 } 1194 }
1079 if (count) 1195 if (count)
1080 count[lru]++; 1196 count[lru] += numpages;
1081 } 1197 }
1082 1198
1083 return nr_active; 1199 return nr_active;
@@ -1112,13 +1228,16 @@ int isolate_lru_page(struct page *page)
1112{ 1228{
1113 int ret = -EBUSY; 1229 int ret = -EBUSY;
1114 1230
1231 VM_BUG_ON(!page_count(page));
1232
1115 if (PageLRU(page)) { 1233 if (PageLRU(page)) {
1116 struct zone *zone = page_zone(page); 1234 struct zone *zone = page_zone(page);
1117 1235
1118 spin_lock_irq(&zone->lru_lock); 1236 spin_lock_irq(&zone->lru_lock);
1119 if (PageLRU(page) && get_page_unless_zero(page)) { 1237 if (PageLRU(page)) {
1120 int lru = page_lru(page); 1238 int lru = page_lru(page);
1121 ret = 0; 1239 ret = 0;
1240 get_page(page);
1122 ClearPageLRU(page); 1241 ClearPageLRU(page);
1123 1242
1124 del_page_from_lru_list(zone, page, lru); 1243 del_page_from_lru_list(zone, page, lru);
@@ -1187,7 +1306,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1187 add_page_to_lru_list(zone, page, lru); 1306 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) { 1307 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru); 1308 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++; 1309 int numpages = hpage_nr_pages(page);
1310 reclaim_stat->recent_rotated[file] += numpages;
1191 } 1311 }
1192 if (!pagevec_add(&pvec, page)) { 1312 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock); 1313 spin_unlock_irq(&zone->lru_lock);
@@ -1253,7 +1373,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1253 return false; 1373 return false;
1254 1374
1255 /* Only stall on lumpy reclaim */ 1375 /* Only stall on lumpy reclaim */
1256 if (!sc->lumpy_reclaim_mode) 1376 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1257 return false; 1377 return false;
1258 1378
1259 /* If we have relaimed everything on the isolated list, no stall */ 1379 /* If we have relaimed everything on the isolated list, no stall */
@@ -1286,7 +1406,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1286 unsigned long nr_scanned; 1406 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0; 1407 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken; 1408 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon; 1409 unsigned long nr_anon;
1291 unsigned long nr_file; 1410 unsigned long nr_file;
1292 1411
@@ -1298,15 +1417,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1298 return SWAP_CLUSTER_MAX; 1417 return SWAP_CLUSTER_MAX;
1299 } 1418 }
1300 1419
1301 1420 set_reclaim_mode(priority, sc, false);
1302 lru_add_drain(); 1421 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock); 1422 spin_lock_irq(&zone->lru_lock);
1304 1423
1305 if (scanning_global_lru(sc)) { 1424 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan, 1425 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order, 1426 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ? 1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE, 1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file); 1429 zone, 0, file);
1311 zone->pages_scanned += nr_scanned; 1430 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd()) 1431 if (current_is_kswapd())
@@ -1318,8 +1437,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1318 } else { 1437 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order, 1439 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ? 1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE, 1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup, 1442 zone, sc->mem_cgroup,
1324 0, file); 1443 0, file);
1325 /* 1444 /*
@@ -1337,20 +1456,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1337 1456
1338 spin_unlock_irq(&zone->lru_lock); 1457 spin_unlock_irq(&zone->lru_lock);
1339 1458
1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1459 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1341 1460
1342 /* Check if we should syncronously wait for writeback */ 1461 /* Check if we should syncronously wait for writeback */
1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1462 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10); 1463 set_reclaim_mode(priority, sc, true);
1345 1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1346 /*
1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1349 */
1350 nr_active = clear_active_flags(&page_list, NULL);
1351 count_vm_events(PGDEACTIVATE, nr_active);
1352
1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 } 1465 }
1355 1466
1356 local_irq_disable(); 1467 local_irq_disable();
@@ -1359,6 +1470,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1470 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1360 1471
1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1472 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1473
1474 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1475 zone_idx(zone),
1476 nr_scanned, nr_reclaimed,
1477 priority,
1478 trace_shrink_flags(file, sc->reclaim_mode));
1362 return nr_reclaimed; 1479 return nr_reclaimed;
1363} 1480}
1364 1481
@@ -1398,7 +1515,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1398 1515
1399 list_move(&page->lru, &zone->lru[lru].list); 1516 list_move(&page->lru, &zone->lru[lru].list);
1400 mem_cgroup_add_lru_list(page, lru); 1517 mem_cgroup_add_lru_list(page, lru);
1401 pgmoved++; 1518 pgmoved += hpage_nr_pages(page);
1402 1519
1403 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1520 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1404 spin_unlock_irq(&zone->lru_lock); 1521 spin_unlock_irq(&zone->lru_lock);
@@ -1466,7 +1583,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1466 } 1583 }
1467 1584
1468 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1585 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1469 nr_rotated++; 1586 nr_rotated += hpage_nr_pages(page);
1470 /* 1587 /*
1471 * Identify referenced, file-backed active pages and 1588 * Identify referenced, file-backed active pages and
1472 * give them one more trip around the active list. So 1589 * give them one more trip around the active list. So
@@ -1506,6 +1623,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1506 spin_unlock_irq(&zone->lru_lock); 1623 spin_unlock_irq(&zone->lru_lock);
1507} 1624}
1508 1625
1626#ifdef CONFIG_SWAP
1509static int inactive_anon_is_low_global(struct zone *zone) 1627static int inactive_anon_is_low_global(struct zone *zone)
1510{ 1628{
1511 unsigned long active, inactive; 1629 unsigned long active, inactive;
@@ -1531,12 +1649,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1531{ 1649{
1532 int low; 1650 int low;
1533 1651
1652 /*
1653 * If we don't have swap space, anonymous page deactivation
1654 * is pointless.
1655 */
1656 if (!total_swap_pages)
1657 return 0;
1658
1534 if (scanning_global_lru(sc)) 1659 if (scanning_global_lru(sc))
1535 low = inactive_anon_is_low_global(zone); 1660 low = inactive_anon_is_low_global(zone);
1536 else 1661 else
1537 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1662 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1538 return low; 1663 return low;
1539} 1664}
1665#else
1666static inline int inactive_anon_is_low(struct zone *zone,
1667 struct scan_control *sc)
1668{
1669 return 0;
1670}
1671#endif
1540 1672
1541static int inactive_file_is_low_global(struct zone *zone) 1673static int inactive_file_is_low_global(struct zone *zone)
1542{ 1674{
@@ -1598,26 +1730,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1598} 1730}
1599 1731
1600/* 1732/*
1601 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1602 * until we collected @swap_cluster_max pages to scan.
1603 */
1604static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1605 unsigned long *nr_saved_scan)
1606{
1607 unsigned long nr;
1608
1609 *nr_saved_scan += nr_to_scan;
1610 nr = *nr_saved_scan;
1611
1612 if (nr >= SWAP_CLUSTER_MAX)
1613 *nr_saved_scan = 0;
1614 else
1615 nr = 0;
1616
1617 return nr;
1618}
1619
1620/*
1621 * Determine how aggressively the anon and file LRU lists should be 1733 * Determine how aggressively the anon and file LRU lists should be
1622 * scanned. The relative value of each set of LRU lists is determined 1734 * scanned. The relative value of each set of LRU lists is determined
1623 * by looking at the fraction of the pages scanned we did rotate back 1735 * by looking at the fraction of the pages scanned we did rotate back
@@ -1635,6 +1747,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1635 u64 fraction[2], denominator; 1747 u64 fraction[2], denominator;
1636 enum lru_list l; 1748 enum lru_list l;
1637 int noswap = 0; 1749 int noswap = 0;
1750 int force_scan = 0;
1751
1752
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1757
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1759 /* kswapd does zone balancing and need to scan this zone */
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1638 1766
1639 /* If we have no swap space, do not bother scanning anon pages. */ 1767 /* If we have no swap space, do not bother scanning anon pages. */
1640 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1768 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1645,11 +1773,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1645 goto out; 1773 goto out;
1646 } 1774 }
1647 1775
1648 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1649 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1650 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1651 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1652
1653 if (scanning_global_lru(sc)) { 1776 if (scanning_global_lru(sc)) {
1654 free = zone_page_state(zone, NR_FREE_PAGES); 1777 free = zone_page_state(zone, NR_FREE_PAGES);
1655 /* If we have very few page cache pages, 1778 /* If we have very few page cache pages,
@@ -1716,24 +1839,87 @@ out:
1716 scan >>= priority; 1839 scan >>= priority;
1717 scan = div64_u64(scan * fraction[file], denominator); 1840 scan = div64_u64(scan * fraction[file], denominator);
1718 } 1841 }
1719 nr[l] = nr_scan_try_batch(scan, 1842
1720 &reclaim_stat->nr_saved_scan[l]); 1843 /*
1844 * If zone is small or memcg is small, nr[l] can be 0.
1845 * This results no-scan on this priority and priority drop down.
1846 * For global direct reclaim, it can visit next zone and tend
1847 * not to have problems. For global kswapd, it's for zone
1848 * balancing and it need to scan a small amounts. When using
1849 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above.
1851 */
1852 if (!scan && force_scan) {
1853 if (file)
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan;
1721 } 1859 }
1722} 1860}
1723 1861
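For illustration only (not part of the patch): a standalone sketch of the per-LRU scan target computed by get_scan_count() above, including the new SWAP_CLUSTER_MAX floor applied when force_scan is set (the anon/noswap special case and div64_u64() are simplified away). SWAP_CLUSTER_MAX is assumed to be its usual value of 32; the list size and fraction/denominator pair are made up.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL   /* assumption: usual kernel value */

static unsigned long scan_target(unsigned long lru_size, int priority,
				 unsigned long fraction, unsigned long denominator,
				 int force_scan)
{
	unsigned long scan = lru_size >> priority;   /* scale down by priority */

	scan = scan * fraction / denominator;        /* split pressure anon vs file */

	/* new in this patch: tiny zones/memcgs still get a minimal scan */
	if (!scan && force_scan)
		scan = SWAP_CLUSTER_MAX;
	return scan;
}

int main(void)
{
	/*
	 * A 4096-page file LRU at DEF_PRIORITY (12) rounds down to zero
	 * (4096 >> 12 = 1, then 1 * 3 / 4 = 0); force_scan bumps it to 32.
	 */
	printf("scan %lu pages\n", scan_target(4096, 12, 3, 4, 1));
	return 0;
}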
1724static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) 1862/*
1863 * Reclaim/compaction depends on a number of pages being freed. To avoid
1864 * disruption to the system, a small number of order-0 pages continue to be
1865 * rotated and reclaimed in the normal fashion. However, by the time we get
1866 * back to the allocator and call try_to_compact_zone(), we ensure that
1867 * there are enough free pages for it to be likely successful
1868 */
1869static inline bool should_continue_reclaim(struct zone *zone,
1870 unsigned long nr_reclaimed,
1871 unsigned long nr_scanned,
1872 struct scan_control *sc)
1725{ 1873{
1874 unsigned long pages_for_compaction;
1875 unsigned long inactive_lru_pages;
1876
1877 /* If not in reclaim/compaction mode, stop */
1878 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1879 return false;
1880
1881 /* Consider stopping depending on scan and reclaim activity */
1882 if (sc->gfp_mask & __GFP_REPEAT) {
1883 /*
1884 * For __GFP_REPEAT allocations, stop reclaiming if the
1885 * full LRU list has been scanned and we are still failing
1886 * to reclaim pages. This full LRU scan is potentially
1887 * expensive but a __GFP_REPEAT caller really wants to succeed
1888 */
1889 if (!nr_reclaimed && !nr_scanned)
1890 return false;
1891 } else {
1892 /*
1893 * For non-__GFP_REPEAT allocations which can presumably
1894 * fail without consequence, stop if we failed to reclaim
1895 * any pages from the last SWAP_CLUSTER_MAX number of
1896 * pages that were scanned. This will return to the
1897 * caller faster at the risk reclaim/compaction and
1898 * the resulting allocation attempt fails
1899 */
1900 if (!nr_reclaimed)
1901 return false;
1902 }
1903
1726 /* 1904 /*
1727 * If we need a large contiguous chunk of memory, or have 1905 * If we have not reclaimed enough pages for compaction and the
1728 * trouble getting a small set of contiguous pages, we 1906 * inactive lists are large enough, continue reclaiming
1729 * will reclaim both active and inactive pages.
1730 */ 1907 */
1731 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1908 pages_for_compaction = (2UL << sc->order);
1732 sc->lumpy_reclaim_mode = 1; 1909 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1733 else if (sc->order && priority < DEF_PRIORITY - 2) 1910 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1734 sc->lumpy_reclaim_mode = 1; 1911 if (sc->nr_reclaimed < pages_for_compaction &&
1735 else 1912 inactive_lru_pages > pages_for_compaction)
1736 sc->lumpy_reclaim_mode = 0; 1913 return true;
1914
1915 /* If compaction would go ahead or the allocation would succeed, stop */
1916 switch (compaction_suitable(zone, sc->order)) {
1917 case COMPACT_PARTIAL:
1918 case COMPACT_CONTINUE:
1919 return false;
1920 default:
1921 return true;
1922 }
1737} 1923}
1738 1924
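For illustration only (not part of the patch): the core threshold test from should_continue_reclaim() above. compaction_suitable() is reduced to a boolean parameter here because it depends on zone watermark state, and the __GFP_REPEAT special cases are omitted; the numbers in main() are illustrative.

#include <stdio.h>
#include <stdbool.h>

static bool keep_reclaiming(int order, unsigned long nr_reclaimed,
			    unsigned long inactive_lru_pages, bool compaction_ready)
{
	/* aim for roughly twice the requested block so compaction has slack */
	unsigned long pages_for_compaction = 2UL << order;

	if (nr_reclaimed < pages_for_compaction &&
	    inactive_lru_pages > pages_for_compaction)
		return true;

	/* otherwise continue only while compaction is not yet ready to run */
	return !compaction_ready;
}

int main(void)
{
	/* order-9 request (2MB huge page): keep going until ~1024 pages are freed */
	printf("continue reclaim: %s\n",
	       keep_reclaiming(9, 300, 50000, false) ? "yes" : "no");
	return 0;
}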
1739/* 1925/*
@@ -1745,13 +1931,14 @@ static void shrink_zone(int priority, struct zone *zone,
1745 unsigned long nr[NR_LRU_LISTS]; 1931 unsigned long nr[NR_LRU_LISTS];
1746 unsigned long nr_to_scan; 1932 unsigned long nr_to_scan;
1747 enum lru_list l; 1933 enum lru_list l;
1748 unsigned long nr_reclaimed = sc->nr_reclaimed; 1934 unsigned long nr_reclaimed, nr_scanned;
1749 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1935 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1750 1936
1937restart:
1938 nr_reclaimed = 0;
1939 nr_scanned = sc->nr_scanned;
1751 get_scan_count(zone, sc, nr, priority); 1940 get_scan_count(zone, sc, nr, priority);
1752 1941
1753 set_lumpy_reclaim_mode(priority, sc);
1754
1755 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1942 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1756 nr[LRU_INACTIVE_FILE]) { 1943 nr[LRU_INACTIVE_FILE]) {
1757 for_each_evictable_lru(l) { 1944 for_each_evictable_lru(l) {
@@ -1775,16 +1962,20 @@ static void shrink_zone(int priority, struct zone *zone,
1775 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1962 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1776 break; 1963 break;
1777 } 1964 }
1778 1965 sc->nr_reclaimed += nr_reclaimed;
1779 sc->nr_reclaimed = nr_reclaimed;
1780 1966
1781 /* 1967 /*
1782 * Even if we did not try to evict anon pages at all, we want to 1968 * Even if we did not try to evict anon pages at all, we want to
1783 * rebalance the anon lru active/inactive ratio. 1969 * rebalance the anon lru active/inactive ratio.
1784 */ 1970 */
1785 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1971 if (inactive_anon_is_low(zone, sc))
1786 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1972 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1787 1973
1974 /* reclaim/compaction might need reclaim to continue */
1975 if (should_continue_reclaim(zone, nr_reclaimed,
1976 sc->nr_scanned - nr_scanned, sc))
1977 goto restart;
1978
1788 throttle_vm_writeout(sc->gfp_mask); 1979 throttle_vm_writeout(sc->gfp_mask);
1789} 1980}
1790 1981
@@ -1809,6 +2000,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1809{ 2000{
1810 struct zoneref *z; 2001 struct zoneref *z;
1811 struct zone *zone; 2002 struct zone *zone;
2003 unsigned long nr_soft_reclaimed;
2004 unsigned long nr_soft_scanned;
1812 2005
1813 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2006 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1814 gfp_zone(sc->gfp_mask), sc->nodemask) { 2007 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1823,6 +2016,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1823 continue; 2016 continue;
1824 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1825 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2019 /*
2020 * This steals pages from memory cgroups over softlimit
2021 * and returns the number of reclaimed pages and
2022 * scanned pages. This works for global memory pressure
2023 * and balancing, not for a memcg's limit.
2024 */
2025 nr_soft_scanned = 0;
2026 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2027 sc->order, sc->gfp_mask,
2028 &nr_soft_scanned);
2029 sc->nr_reclaimed += nr_soft_reclaimed;
2030 sc->nr_scanned += nr_soft_scanned;
2031 /* need some check for avoid more shrink_zone() */
1826 } 2032 }
1827 2033
1828 shrink_zone(priority, zone, sc); 2034 shrink_zone(priority, zone, sc);
@@ -1834,17 +2040,12 @@ static bool zone_reclaimable(struct zone *zone)
1834 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 2040 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1835} 2041}
1836 2042
1837/* 2043/* All zones in zonelist are unreclaimable? */
1838 * As hibernation is going on, kswapd is freezed so that it can't mark
1839 * the zone into all_unreclaimable. It can't handle OOM during hibernation.
1840 * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
1841 */
1842static bool all_unreclaimable(struct zonelist *zonelist, 2044static bool all_unreclaimable(struct zonelist *zonelist,
1843 struct scan_control *sc) 2045 struct scan_control *sc)
1844{ 2046{
1845 struct zoneref *z; 2047 struct zoneref *z;
1846 struct zone *zone; 2048 struct zone *zone;
1847 bool all_unreclaimable = true;
1848 2049
1849 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2050 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1850 gfp_zone(sc->gfp_mask), sc->nodemask) { 2051 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1852,13 +2053,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1852 continue; 2053 continue;
1853 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2054 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1854 continue; 2055 continue;
1855 if (zone_reclaimable(zone)) { 2056 if (!zone->all_unreclaimable)
1856 all_unreclaimable = false; 2057 return false;
1857 break;
1858 }
1859 } 2058 }
1860 2059
1861 return all_unreclaimable; 2060 return true;
1862} 2061}
1863 2062
1864/* 2063/*
@@ -1878,7 +2077,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1878 * else, the number of pages reclaimed 2077 * else, the number of pages reclaimed
1879 */ 2078 */
1880static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2079static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1881 struct scan_control *sc) 2080 struct scan_control *sc,
2081 struct shrink_control *shrink)
1882{ 2082{
1883 int priority; 2083 int priority;
1884 unsigned long total_scanned = 0; 2084 unsigned long total_scanned = 0;
@@ -1896,7 +2096,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1896 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2096 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1897 sc->nr_scanned = 0; 2097 sc->nr_scanned = 0;
1898 if (!priority) 2098 if (!priority)
1899 disable_swap_token(); 2099 disable_swap_token(sc->mem_cgroup);
1900 shrink_zones(priority, zonelist, sc); 2100 shrink_zones(priority, zonelist, sc);
1901 /* 2101 /*
1902 * Don't shrink slabs when reclaiming memory from 2102 * Don't shrink slabs when reclaiming memory from
@@ -1912,7 +2112,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1912 lru_pages += zone_reclaimable_pages(zone); 2112 lru_pages += zone_reclaimable_pages(zone);
1913 } 2113 }
1914 2114
1915 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 2115 shrink_slab(shrink, sc->nr_scanned, lru_pages);
1916 if (reclaim_state) { 2116 if (reclaim_state) {
1917 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2117 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1918 reclaim_state->reclaimed_slab = 0; 2118 reclaim_state->reclaimed_slab = 0;
@@ -1937,27 +2137,31 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1937 2137
1938 /* Take a nap, wait for some writeback to complete */ 2138 /* Take a nap, wait for some writeback to complete */
1939 if (!sc->hibernation_mode && sc->nr_scanned && 2139 if (!sc->hibernation_mode && sc->nr_scanned &&
1940 priority < DEF_PRIORITY - 2) 2140 priority < DEF_PRIORITY - 2) {
1941 congestion_wait(BLK_RW_ASYNC, HZ/10); 2141 struct zone *preferred_zone;
2142
2143 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2144 &cpuset_current_mems_allowed,
2145 &preferred_zone);
2146 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2147 }
1942 } 2148 }
1943 2149
1944out: 2150out:
1945 /*
1946 * Now that we've scanned all the zones at this priority level, note
1947 * that level within the zone so that the next thread which performs
1948 * scanning of this zone will immediately start out at this priority
1949 * level. This affects only the decision whether or not to bring
1950 * mapped pages onto the inactive list.
1951 */
1952 if (priority < 0)
1953 priority = 0;
1954
1955 delayacct_freepages_end(); 2151 delayacct_freepages_end();
1956 put_mems_allowed(); 2152 put_mems_allowed();
1957 2153
1958 if (sc->nr_reclaimed) 2154 if (sc->nr_reclaimed)
1959 return sc->nr_reclaimed; 2155 return sc->nr_reclaimed;
1960 2156
2157 /*
2158 * As hibernation is going on, kswapd is freezed so that it can't mark
2159 * the zone into all_unreclaimable. Thus bypassing all_unreclaimable
2160 * check.
2161 */
2162 if (oom_killer_disabled)
2163 return 0;
2164
1961 /* top priority shrink_zones still had more to do? don't OOM, then */ 2165 /* top priority shrink_zones still had more to do? don't OOM, then */
1962 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2166 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
1963 return 1; 2167 return 1;
@@ -1980,12 +2184,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1980 .mem_cgroup = NULL, 2184 .mem_cgroup = NULL,
1981 .nodemask = nodemask, 2185 .nodemask = nodemask,
1982 }; 2186 };
2187 struct shrink_control shrink = {
2188 .gfp_mask = sc.gfp_mask,
2189 };
1983 2190
1984 trace_mm_vmscan_direct_reclaim_begin(order, 2191 trace_mm_vmscan_direct_reclaim_begin(order,
1985 sc.may_writepage, 2192 sc.may_writepage,
1986 gfp_mask); 2193 gfp_mask);
1987 2194
1988 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2195 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
1989 2196
1990 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2197 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1991 2198
@@ -1997,9 +2204,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1997unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1998 gfp_t gfp_mask, bool noswap, 2205 gfp_t gfp_mask, bool noswap,
1999 unsigned int swappiness, 2206 unsigned int swappiness,
2000 struct zone *zone) 2207 struct zone *zone,
2208 unsigned long *nr_scanned)
2001{ 2209{
2002 struct scan_control sc = { 2210 struct scan_control sc = {
2211 .nr_scanned = 0,
2003 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2212 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2004 .may_writepage = !laptop_mode, 2213 .may_writepage = !laptop_mode,
2005 .may_unmap = 1, 2214 .may_unmap = 1,
@@ -2008,6 +2217,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2008 .order = 0, 2217 .order = 0,
2009 .mem_cgroup = mem, 2218 .mem_cgroup = mem,
2010 }; 2219 };
2220
2011 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2221 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2012 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2222 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2013 2223
@@ -2026,6 +2236,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2026 2236
2027 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2237 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2028 2238
2239 *nr_scanned = sc.nr_scanned;
2029 return sc.nr_reclaimed; 2240 return sc.nr_reclaimed;
2030} 2241}
2031 2242
@@ -2036,6 +2247,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2036{ 2247{
2037 struct zonelist *zonelist; 2248 struct zonelist *zonelist;
2038 unsigned long nr_reclaimed; 2249 unsigned long nr_reclaimed;
2250 int nid;
2039 struct scan_control sc = { 2251 struct scan_control sc = {
2040 .may_writepage = !laptop_mode, 2252 .may_writepage = !laptop_mode,
2041 .may_unmap = 1, 2253 .may_unmap = 1,
@@ -2045,17 +2257,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2045 .order = 0, 2257 .order = 0,
2046 .mem_cgroup = mem_cont, 2258 .mem_cgroup = mem_cont,
2047 .nodemask = NULL, /* we don't care the placement */ 2259 .nodemask = NULL, /* we don't care the placement */
2260 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2261 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2262 };
2263 struct shrink_control shrink = {
2264 .gfp_mask = sc.gfp_mask,
2048 }; 2265 };
2049 2266
2050 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2267 /*
2051 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2268 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2052 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2269 * take care of from where we get pages. So the node where we start the
2270 * scan does not need to be the current node.
2271 */
2272 nid = mem_cgroup_select_victim_node(mem_cont);
2273
2274 zonelist = NODE_DATA(nid)->node_zonelists;
2053 2275
2054 trace_mm_vmscan_memcg_reclaim_begin(0, 2276 trace_mm_vmscan_memcg_reclaim_begin(0,
2055 sc.may_writepage, 2277 sc.may_writepage,
2056 sc.gfp_mask); 2278 sc.gfp_mask);
2057 2279
2058 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2280 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2059 2281
2060 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2282 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2061 2283
@@ -2063,38 +2285,88 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2063} 2285}
2064#endif 2286#endif
2065 2287
2288/*
2289 * pgdat_balanced is used when checking if a node is balanced for high-order
2290 * allocations. Only zones that meet watermarks and are in a zone allowed
2291 * by the callers classzone_idx are added to balanced_pages. The total of
2292 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2293 * for the node to be considered balanced. Forcing all zones to be balanced
2294 * for high orders can cause excessive reclaim when there are imbalanced zones.
2295 * The choice of 25% is due to
2296 * o a 16M DMA zone that is balanced will not balance a zone on any
2297 * reasonable sized machine
2298 * o On all other machines, the top zone must be at least a reasonable
2299 * percentage of the middle zones. For example, on 32-bit x86, highmem
2300 * would need to be at least 256M for it to be balance a whole node.
2301 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2302 * to balance a node on its own. These seemed like reasonable ratios.
2303 */
2304static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2305 int classzone_idx)
2306{
2307 unsigned long present_pages = 0;
2308 int i;
2309
2310 for (i = 0; i <= classzone_idx; i++)
2311 present_pages += pgdat->node_zones[i].present_pages;
2312
2313 /* A special case here: if zone has no page, we think it's balanced */
2314 return balanced_pages >= (present_pages >> 2);
2315}
2316
2066/* is kswapd sleeping prematurely? */ 2317/* is kswapd sleeping prematurely? */
2067static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2318static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2319 int classzone_idx)
2068{ 2320{
2069 int i; 2321 int i;
2322 unsigned long balanced = 0;
2323 bool all_zones_ok = true;
2070 2324
2071 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2325 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2072 if (remaining) 2326 if (remaining)
2073 return 1; 2327 return true;
2074 2328
2075 /* If after HZ/10, a zone is below the high mark, it's premature */ 2329 /* Check the watermark levels */
2076 for (i = 0; i < pgdat->nr_zones; i++) { 2330 for (i = 0; i <= classzone_idx; i++) {
2077 struct zone *zone = pgdat->node_zones + i; 2331 struct zone *zone = pgdat->node_zones + i;
2078 2332
2079 if (!populated_zone(zone)) 2333 if (!populated_zone(zone))
2080 continue; 2334 continue;
2081 2335
2082 if (zone->all_unreclaimable) 2336 /*
2337 * balance_pgdat() skips over all_unreclaimable after
2338 * DEF_PRIORITY. Effectively, it considers them balanced so
2339 * they must be considered balanced here as well if kswapd
2340 * is to sleep
2341 */
2342 if (zone->all_unreclaimable) {
2343 balanced += zone->present_pages;
2083 continue; 2344 continue;
2345 }
2084 2346
2085 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2347 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2086 0, 0)) 2348 i, 0))
2087 return 1; 2349 all_zones_ok = false;
2350 else
2351 balanced += zone->present_pages;
2088 } 2352 }
2089 2353
2090 return 0; 2354 /*
2355 * For high-order requests, the balanced zones must contain at least
2356 * 25% of the nodes pages for kswapd to sleep. For order-0, all zones
2357 * must be balanced
2358 */
2359 if (order)
2360 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2361 else
2362 return !all_zones_ok;
2091} 2363}
2092 2364
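For illustration only (not part of the patch): a standalone sketch of the 25% rule implemented by pgdat_balanced() and applied by sleeping_prematurely() above for high-order wakeups. zone_watermark_ok_safe() is reduced to a per-zone boolean and the zone sizes are made-up examples.

#include <stdio.h>
#include <stdbool.h>

struct zone_sketch {
	unsigned long present_pages;
	bool watermark_ok;   /* stands in for zone_watermark_ok_safe() */
};

static bool pgdat_balanced_sketch(const struct zone_sketch *zones, int nzones,
				  int classzone_idx)
{
	unsigned long present = 0, balanced = 0;
	int i;

	for (i = 0; i <= classzone_idx && i < nzones; i++) {
		present += zones[i].present_pages;
		if (zones[i].watermark_ok)
			balanced += zones[i].present_pages;
	}

	/* balanced zones must hold at least a quarter of the node's pages */
	return balanced >= (present >> 2);
}

int main(void)
{
	/* DMA and Normal meet their watermarks, DMA32 does not */
	struct zone_sketch zones[] = {
		{ 4096,   true  },   /* ~16MB DMA */
		{ 774144, false },   /* ~3GB DMA32 */
		{ 196608, true  },   /* ~768MB Normal */
	};

	/* only ~21% of the node's pages sit in balanced zones, so kswapd keeps going */
	printf("kswapd may sleep: %s\n",
	       pgdat_balanced_sketch(zones, 3, 2) ? "yes" : "no");
	return 0;
}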
2093/* 2365/*
2094 * For kswapd, balance_pgdat() will work across all this node's zones until 2366 * For kswapd, balance_pgdat() will work across all this node's zones until
2095 * they are all at high_wmark_pages(zone). 2367 * they are all at high_wmark_pages(zone).
2096 * 2368 *
2097 * Returns the number of pages which were actually freed. 2369 * Returns the final order kswapd was reclaiming at
2098 * 2370 *
2099 * There is special handling here for zones which are full of pinned pages. 2371 * There is special handling here for zones which are full of pinned pages.
2100 * This can happen if the pages are all mlocked, or if they are all used by 2372 * This can happen if the pages are all mlocked, or if they are all used by
@@ -2111,13 +2383,18 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2111 * interoperates with the page allocator fallback scheme to ensure that aging 2383 * interoperates with the page allocator fallback scheme to ensure that aging
2112 * of pages is balanced across the zones. 2384 * of pages is balanced across the zones.
2113 */ 2385 */
2114static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2386static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2387 int *classzone_idx)
2115{ 2388{
2116 int all_zones_ok; 2389 int all_zones_ok;
2390 unsigned long balanced;
2117 int priority; 2391 int priority;
2118 int i; 2392 int i;
2393 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2119 unsigned long total_scanned; 2394 unsigned long total_scanned;
2120 struct reclaim_state *reclaim_state = current->reclaim_state; 2395 struct reclaim_state *reclaim_state = current->reclaim_state;
2396 unsigned long nr_soft_reclaimed;
2397 unsigned long nr_soft_scanned;
2121 struct scan_control sc = { 2398 struct scan_control sc = {
2122 .gfp_mask = GFP_KERNEL, 2399 .gfp_mask = GFP_KERNEL,
2123 .may_unmap = 1, 2400 .may_unmap = 1,
@@ -2131,6 +2408,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2131 .order = order, 2408 .order = order,
2132 .mem_cgroup = NULL, 2409 .mem_cgroup = NULL,
2133 }; 2410 };
2411 struct shrink_control shrink = {
2412 .gfp_mask = sc.gfp_mask,
2413 };
2134loop_again: 2414loop_again:
2135 total_scanned = 0; 2415 total_scanned = 0;
2136 sc.nr_reclaimed = 0; 2416 sc.nr_reclaimed = 0;
@@ -2138,15 +2418,15 @@ loop_again:
2138 count_vm_event(PAGEOUTRUN); 2418 count_vm_event(PAGEOUTRUN);
2139 2419
2140 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2420 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2141 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2142 unsigned long lru_pages = 0; 2421 unsigned long lru_pages = 0;
2143 int has_under_min_watermark_zone = 0; 2422 int has_under_min_watermark_zone = 0;
2144 2423
2145 /* The swap token gets in the way of swapout... */ 2424 /* The swap token gets in the way of swapout... */
2146 if (!priority) 2425 if (!priority)
2147 disable_swap_token(); 2426 disable_swap_token(NULL);
2148 2427
2149 all_zones_ok = 1; 2428 all_zones_ok = 1;
2429 balanced = 0;
2150 2430
2151 /* 2431 /*
2152 * Scan in the highmem->dma direction for the highest 2432 * Scan in the highmem->dma direction for the highest
@@ -2169,7 +2449,7 @@ loop_again:
2169 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2449 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2170 &sc, priority, 0); 2450 &sc, priority, 0);
2171 2451
2172 if (!zone_watermark_ok(zone, order, 2452 if (!zone_watermark_ok_safe(zone, order,
2173 high_wmark_pages(zone), 0, 0)) { 2453 high_wmark_pages(zone), 0, 0)) {
2174 end_zone = i; 2454 end_zone = i;
2175 break; 2455 break;
@@ -2196,6 +2476,7 @@ loop_again:
2196 for (i = 0; i <= end_zone; i++) { 2476 for (i = 0; i <= end_zone; i++) {
2197 struct zone *zone = pgdat->node_zones + i; 2477 struct zone *zone = pgdat->node_zones + i;
2198 int nr_slab; 2478 int nr_slab;
2479 unsigned long balance_gap;
2199 2480
2200 if (!populated_zone(zone)) 2481 if (!populated_zone(zone))
2201 continue; 2482 continue;
@@ -2205,28 +2486,42 @@ loop_again:
2205 2486
2206 sc.nr_scanned = 0; 2487 sc.nr_scanned = 0;
2207 2488
2489 nr_soft_scanned = 0;
2208 /* 2490 /*
2209 * Call soft limit reclaim before calling shrink_zone. 2491 * Call soft limit reclaim before calling shrink_zone.
2210 * For now we ignore the return value
2211 */ 2492 */
2212 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2493 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2494 order, sc.gfp_mask,
2495 &nr_soft_scanned);
2496 sc.nr_reclaimed += nr_soft_reclaimed;
2497 total_scanned += nr_soft_scanned;
2213 2498
2214 /* 2499 /*
2215 * We put equal pressure on every zone, unless one 2500 * We put equal pressure on every zone, unless
2216 * zone has way too many pages free already. 2501 * one zone has way too many pages free
2502 * already. The "too many pages" is defined
2503 * as the high wmark plus a "gap" where the
2504 * gap is either the low watermark or 1%
2505 * of the zone, whichever is smaller.
2217 */ 2506 */
2218 if (!zone_watermark_ok(zone, order, 2507 balance_gap = min(low_wmark_pages(zone),
2219 8*high_wmark_pages(zone), end_zone, 0)) 2508 (zone->present_pages +
2509 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2510 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2511 if (!zone_watermark_ok_safe(zone, order,
2512 high_wmark_pages(zone) + balance_gap,
2513 end_zone, 0)) {
2220 shrink_zone(priority, zone, &sc); 2514 shrink_zone(priority, zone, &sc);
2221 reclaim_state->reclaimed_slab = 0; 2515
2222 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2516 reclaim_state->reclaimed_slab = 0;
2223 lru_pages); 2517 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2224 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2518 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2225 total_scanned += sc.nr_scanned; 2519 total_scanned += sc.nr_scanned;
2226 if (zone->all_unreclaimable) 2520
2227 continue; 2521 if (nr_slab == 0 && !zone_reclaimable(zone))
2228 if (nr_slab == 0 && !zone_reclaimable(zone)) 2522 zone->all_unreclaimable = 1;
2229 zone->all_unreclaimable = 1; 2523 }
2524
2230 /* 2525 /*
2231 * If we've done a decent amount of scanning and 2526 * If we've done a decent amount of scanning and
2232 * the reclaim ratio is low, start doing writepage 2527 * the reclaim ratio is low, start doing writepage
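The balance_gap arithmetic in this hunk can be made concrete with a small worked example (numbers are hypothetical; KSWAPD_ZONE_BALANCE_GAP_RATIO is the 1% ratio referred to in the comment):

	/*
	 * A 4GiB zone of 4KiB pages has present_pages = 1048576.  With a
	 * ratio of 100, the gap term is the rounded-up 1048576/100 = 10486
	 * pages (~41MiB), so
	 *
	 *	balance_gap = min(low_wmark_pages(zone), 10486);
	 *
	 * and kswapd keeps shrinking the zone until free pages exceed
	 * high_wmark_pages(zone) + balance_gap, checked with the _safe
	 * variant.
	 */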
@@ -2236,7 +2531,13 @@ loop_again:
2236 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2531 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2237 sc.may_writepage = 1; 2532 sc.may_writepage = 1;
2238 2533
2239 if (!zone_watermark_ok(zone, order, 2534 if (zone->all_unreclaimable) {
2535 if (end_zone && end_zone == i)
2536 end_zone--;
2537 continue;
2538 }
2539
2540 if (!zone_watermark_ok_safe(zone, order,
2240 high_wmark_pages(zone), end_zone, 0)) { 2541 high_wmark_pages(zone), end_zone, 0)) {
2241 all_zones_ok = 0; 2542 all_zones_ok = 0;
2242 /* 2543 /*
@@ -2244,13 +2545,24 @@ loop_again:
2244 * means that we have a GFP_ATOMIC allocation 2545 * means that we have a GFP_ATOMIC allocation
2245 * failure risk. Hurry up! 2546 * failure risk. Hurry up!
2246 */ 2547 */
2247 if (!zone_watermark_ok(zone, order, 2548 if (!zone_watermark_ok_safe(zone, order,
2248 min_wmark_pages(zone), end_zone, 0)) 2549 min_wmark_pages(zone), end_zone, 0))
2249 has_under_min_watermark_zone = 1; 2550 has_under_min_watermark_zone = 1;
2551 } else {
2552 /*
2553 * If a zone reaches its high watermark,
2554 * consider it to be no longer congested. It's
2555 * possible there are dirty pages backed by
2556 * congested BDIs but as pressure is relieved,
2557 * speculatively avoid congestion waits
2558 */
2559 zone_clear_flag(zone, ZONE_CONGESTED);
2560 if (i <= *classzone_idx)
2561 balanced += zone->present_pages;
2250 } 2562 }
2251 2563
2252 } 2564 }
2253 if (all_zones_ok) 2565 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2254 break; /* kswapd: all done */ 2566 break; /* kswapd: all done */
2255 /* 2567 /*
2256 * OK, kswapd is getting into trouble. Take a nap, then take 2568 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2273,7 +2585,13 @@ loop_again:
2273 break; 2585 break;
2274 } 2586 }
2275out: 2587out:
2276 if (!all_zones_ok) { 2588
2589 /*
2590 * order-0: All zones must meet high watermark for a balanced node
2591 * high-order: Balanced zones must make up at least 25% of the node
2592 * for the node to be balanced
2593 */
2594 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2277 cond_resched(); 2595 cond_resched();
2278 2596
2279 try_to_freeze(); 2597 try_to_freeze();
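pgdat_balanced(), used in the break test above, is added earlier in vmscan.c (outside this excerpt) and implements the 25% rule spelled out in the comment after the out: label. A simplified sketch of what it computes:

	/* Sketch only: with "balanced" counting the pages of zones that are
	 * at or above their high watermark (up to classzone_idx), the node
	 * is treated as balanced for high-order reclaim once those zones
	 * hold more than a quarter of the node's pages. */
	static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced,
				   int classzone_idx)
	{
		unsigned long present_pages = 0;
		int i;

		for (i = 0; i <= classzone_idx; i++)
			present_pages += pgdat->node_zones[i].present_pages;

		return balanced > present_pages / 4;
	}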
@@ -2298,7 +2616,88 @@ out:
2298 goto loop_again; 2616 goto loop_again;
2299 } 2617 }
2300 2618
2301 return sc.nr_reclaimed; 2619 /*
2620 * If kswapd was reclaiming at a higher order, it has the option of
2621 * sleeping without all zones being balanced. Before it does, it must
2622 * ensure that the watermarks for order-0 on *all* zones are met and
2623 * that the congestion flags are cleared. The congestion flag must
2624 * be cleared as kswapd is the only mechanism that clears the flag
2625 * and it is potentially going to sleep here.
2626 */
2627 if (order) {
2628 for (i = 0; i <= end_zone; i++) {
2629 struct zone *zone = pgdat->node_zones + i;
2630
2631 if (!populated_zone(zone))
2632 continue;
2633
2634 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2635 continue;
2636
2637 /* Confirm the zone is balanced for order-0 */
2638 if (!zone_watermark_ok(zone, 0,
2639 high_wmark_pages(zone), 0, 0)) {
2640 order = sc.order = 0;
2641 goto loop_again;
2642 }
2643
2644 /* If balanced, clear the congested flag */
2645 zone_clear_flag(zone, ZONE_CONGESTED);
2646 }
2647 }
2648
2649 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely()
2651 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level
2654 */
2655 *classzone_idx = end_zone;
2656 return order;
2657}
2658
2659static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2660{
2661 long remaining = 0;
2662 DEFINE_WAIT(wait);
2663
2664 if (freezing(current) || kthread_should_stop())
2665 return;
2666
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668
2669 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2674 }
2675
2676 /*
2677 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up.
2679 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682
2683 /*
2684 * vmstat counters are not perfectly accurate and the estimated
2685 * value for counters such as NR_FREE_PAGES can deviate from the
2686 * true value by nr_online_cpus * threshold. To avoid the zone
2687 * watermarks being breached while under pressure, we reduce the
2688 * per-cpu vmstat threshold while kswapd is awake and restore
2689 * them before going back to sleep.
2690 */
2691 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2692 schedule();
2693 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2694 } else {
2695 if (remaining)
2696 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2697 else
2698 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2699 }
2700 finish_wait(&pgdat->kswapd_wait, &wait);
2302} 2701}
2303 2702
2304/* 2703/*
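To put a number on the drift described in the comment above kswapd's schedule(): per-cpu vmstat deltas are only folded into the global counters once they cross a per-cpu threshold, so a reader can be off by up to num_online_cpus() * threshold pages. An illustrative calculation (the 125-page cap on the normal threshold is an assumption, not taken from this excerpt):

	/* Hypothetical machine: 16 online CPUs, per-cpu threshold at 125. */
	int cpus = 16, threshold = 125;
	long max_drift = (long)cpus * threshold;  /* 2000 pages, ~7.8MiB of error */

This is why kswapd drops to the tighter pressure threshold while it is awake and restores the normal one just before going to sleep.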
@@ -2316,10 +2715,11 @@ out:
2316 */ 2715 */
2317static int kswapd(void *p) 2716static int kswapd(void *p)
2318{ 2717{
2319 unsigned long order; 2718 unsigned long order, new_order;
2719 int classzone_idx, new_classzone_idx;
2320 pg_data_t *pgdat = (pg_data_t*)p; 2720 pg_data_t *pgdat = (pg_data_t*)p;
2321 struct task_struct *tsk = current; 2721 struct task_struct *tsk = current;
2322 DEFINE_WAIT(wait); 2722
2323 struct reclaim_state reclaim_state = { 2723 struct reclaim_state reclaim_state = {
2324 .reclaimed_slab = 0, 2724 .reclaimed_slab = 0,
2325 }; 2725 };
@@ -2346,50 +2746,37 @@ static int kswapd(void *p)
2346 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2746 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2347 set_freezable(); 2747 set_freezable();
2348 2748
2349 order = 0; 2749 order = new_order = 0;
2750 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2350 for ( ; ; ) { 2751 for ( ; ; ) {
2351 unsigned long new_order;
2352 int ret; 2752 int ret;
2353 2753
2354 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2754 /*
2355 new_order = pgdat->kswapd_max_order; 2755 * If the last balance_pgdat was unsuccessful it's unlikely a
2356 pgdat->kswapd_max_order = 0; 2756 * new request of a similar or harder type will succeed soon
2357 if (order < new_order) { 2757 * so consider going to sleep on the basis we reclaimed at
2758 */
2759 if (classzone_idx >= new_classzone_idx && order == new_order) {
2760 new_order = pgdat->kswapd_max_order;
2761 new_classzone_idx = pgdat->classzone_idx;
2762 pgdat->kswapd_max_order = 0;
2763 pgdat->classzone_idx = pgdat->nr_zones - 1;
2764 }
2765
2766 if (order < new_order || classzone_idx > new_classzone_idx) {
2358 /* 2767 /*
2359 * Don't sleep if someone wants a larger 'order' 2768 * Don't sleep if someone wants a larger 'order'
2360 * allocation 2769 * allocation or has tighter zone constraints
2361 */ 2770 */
2362 order = new_order; 2771 order = new_order;
2772 classzone_idx = new_classzone_idx;
2363 } else { 2773 } else {
2364 if (!freezing(current) && !kthread_should_stop()) { 2774 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2365 long remaining = 0;
2366
2367 /* Try to sleep for a short interval */
2368 if (!sleeping_prematurely(pgdat, order, remaining)) {
2369 remaining = schedule_timeout(HZ/10);
2370 finish_wait(&pgdat->kswapd_wait, &wait);
2371 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2372 }
2373
2374 /*
2375 * After a short sleep, check if it was a
2376 * premature sleep. If not, then go fully
2377 * to sleep until explicitly woken up
2378 */
2379 if (!sleeping_prematurely(pgdat, order, remaining)) {
2380 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2381 schedule();
2382 } else {
2383 if (remaining)
2384 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2385 else
2386 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2387 }
2388 }
2389
2390 order = pgdat->kswapd_max_order; 2775 order = pgdat->kswapd_max_order;
2776 classzone_idx = pgdat->classzone_idx;
2777 pgdat->kswapd_max_order = 0;
2778 pgdat->classzone_idx = pgdat->nr_zones - 1;
2391 } 2779 }
2392 finish_wait(&pgdat->kswapd_wait, &wait);
2393 2780
2394 ret = try_to_freeze(); 2781 ret = try_to_freeze();
2395 if (kthread_should_stop()) 2782 if (kthread_should_stop())
@@ -2401,7 +2788,7 @@ static int kswapd(void *p)
2401 */ 2788 */
2402 if (!ret) { 2789 if (!ret) {
2403 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2790 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2404 balance_pgdat(pgdat, order); 2791 order = balance_pgdat(pgdat, order, &classzone_idx);
2405 } 2792 }
2406 } 2793 }
2407 return 0; 2794 return 0;
@@ -2410,23 +2797,26 @@ static int kswapd(void *p)
2410/* 2797/*
2411 * A zone is low on free memory, so wake its kswapd task to service it. 2798 * A zone is low on free memory, so wake its kswapd task to service it.
2412 */ 2799 */
2413void wakeup_kswapd(struct zone *zone, int order) 2800void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2414{ 2801{
2415 pg_data_t *pgdat; 2802 pg_data_t *pgdat;
2416 2803
2417 if (!populated_zone(zone)) 2804 if (!populated_zone(zone))
2418 return; 2805 return;
2419 2806
2420 pgdat = zone->zone_pgdat;
2421 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2422 return;
2423 if (pgdat->kswapd_max_order < order)
2424 pgdat->kswapd_max_order = order;
2425 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2426 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2427 return; 2808 return;
2809 pgdat = zone->zone_pgdat;
2810 if (pgdat->kswapd_max_order < order) {
2811 pgdat->kswapd_max_order = order;
2812 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2813 }
2428 if (!waitqueue_active(&pgdat->kswapd_wait)) 2814 if (!waitqueue_active(&pgdat->kswapd_wait))
2429 return; 2815 return;
2816 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2817 return;
2818
2819 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2430 wake_up_interruptible(&pgdat->kswapd_wait); 2820 wake_up_interruptible(&pgdat->kswapd_wait);
2431} 2821}
2432 2822
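The new classzone_idx argument to wakeup_kswapd() comes from the allocator slow path, which wakes kswapd for every zone the failing allocation could use. Roughly, as a sketch of the mm/page_alloc.c caller in this series (the name and exact signature are an assumption, not verified against this tree):

	static void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
				    enum zone_type high_zoneidx,
				    enum zone_type classzone_idx)
	{
		struct zoneref *z;
		struct zone *zone;

		/* Ask each node's kswapd to balance zones up to classzone_idx. */
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
			wakeup_kswapd(zone, order, classzone_idx);
	}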
@@ -2487,7 +2877,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2487 .swappiness = vm_swappiness, 2877 .swappiness = vm_swappiness,
2488 .order = 0, 2878 .order = 0,
2489 }; 2879 };
2490 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2880 struct shrink_control shrink = {
2881 .gfp_mask = sc.gfp_mask,
2882 };
2883 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2491 struct task_struct *p = current; 2884 struct task_struct *p = current;
2492 unsigned long nr_reclaimed; 2885 unsigned long nr_reclaimed;
2493 2886
@@ -2496,7 +2889,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2496 reclaim_state.reclaimed_slab = 0; 2889 reclaim_state.reclaimed_slab = 0;
2497 p->reclaim_state = &reclaim_state; 2890 p->reclaim_state = &reclaim_state;
2498 2891
2499 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2892 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2500 2893
2501 p->reclaim_state = NULL; 2894 p->reclaim_state = NULL;
2502 lockdep_clear_current_reclaim_state(); 2895 lockdep_clear_current_reclaim_state();
@@ -2671,6 +3064,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2671 .swappiness = vm_swappiness, 3064 .swappiness = vm_swappiness,
2672 .order = order, 3065 .order = order,
2673 }; 3066 };
3067 struct shrink_control shrink = {
3068 .gfp_mask = sc.gfp_mask,
3069 };
2674 unsigned long nr_slab_pages0, nr_slab_pages1; 3070 unsigned long nr_slab_pages0, nr_slab_pages1;
2675 3071
2676 cond_resched(); 3072 cond_resched();
@@ -2712,7 +3108,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2712 unsigned long lru_pages = zone_reclaimable_pages(zone); 3108 unsigned long lru_pages = zone_reclaimable_pages(zone);
2713 3109
2714 /* No reclaimable slab or very low memory pressure */ 3110 /* No reclaimable slab or very low memory pressure */
2715 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) 3111 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
2716 break; 3112 break;
2717 3113
2718 /* Freed enough memory */ 3114 /* Freed enough memory */
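A recurring change in the hunks above is that shrink_slab() no longer takes a gfp_mask directly: each reclaim path builds a struct shrink_control next to its scan_control and passes that instead. A minimal sketch of the calling pattern, assuming the signature used in these hunks:

	struct shrink_control shrink = {
		.gfp_mask = GFP_KERNEL,
	};

	/* nr_scanned LRU pages were just scanned out of lru_pages in total;
	 * shrink_slab() uses that ratio to put proportional pressure on the
	 * registered slab shrinkers. */
	nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);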
@@ -2987,6 +3383,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2987 return 0; 3383 return 0;
2988} 3384}
2989 3385
3386#ifdef CONFIG_NUMA
2990/* 3387/*
2991 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3388 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2992 * a specified node's per zone unevictable lists for evictable pages. 3389 * a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3430,4 @@ void scan_unevictable_unregister_node(struct node *node)
3033{ 3430{
3034 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3431 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3035} 3432}
3036 3433#endif