path: root/mm/vmscan.c
Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--  mm/vmscan.c | 750
 1 file changed, 214 insertions(+), 536 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33dc256033b5..66e431060c05 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
53#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
55 55
56/*
57 * reclaim_mode determines how the inactive list is shrunk
58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
59 * RECLAIM_MODE_ASYNC: Do not block
60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
74struct scan_control { 56struct scan_control {
75 /* Incremented by the number of inactive pages that were scanned */ 57 /* Incremented by the number of inactive pages that were scanned */
76 unsigned long nr_scanned; 58 unsigned long nr_scanned;
@@ -96,11 +78,8 @@ struct scan_control {
96 78
97 int order; 79 int order;
98 80
99 /* 81 /* Scan (total_size >> priority) pages at once */
100 * Intend to reclaim enough continuous memory rather than reclaim 82 int priority;
101 * enough amount of memory. i.e, mode for high order allocation.
102 */
103 reclaim_mode_t reclaim_mode;
104 83
105 /* 84 /*
106 * The memory cgroup that hit its limit and as a result is the 85 * The memory cgroup that hit its limit and as a result is the
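
The hunk above replaces the removed reclaim_mode plumbing with a single `int priority` field carried in scan_control, documented as "Scan (total_size >> priority) pages at once". Below is a minimal userspace sketch of that arithmetic, using a simplified stand-in struct rather than the kernel's scan_control and assuming DEF_PRIORITY is 12 as in mainline kernels of this era:

#include <stdio.h>

#define DEF_PRIORITY 12			/* assumed value, from include/linux/mmzone.h */

/* simplified stand-in for the kernel's struct scan_control */
struct scan_control_sketch {
	int priority;			/* scan (total_size >> priority) pages at once */
};

static unsigned long pages_to_scan(unsigned long lru_size, int priority)
{
	return lru_size >> priority;
}

int main(void)
{
	struct scan_control_sketch sc = { .priority = DEF_PRIORITY };
	unsigned long lru_size = 1UL << 20;	/* pretend 1M pages on this LRU */

	/* each pass that fails to make progress lowers the priority,
	 * doubling the share of the LRU scanned next time */
	for (; sc.priority >= 0; sc.priority--)
		printf("priority %2d -> scan %lu pages\n",
		       sc.priority, pages_to_scan(lru_size, sc.priority));
	return 0;
}
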
@@ -115,11 +94,6 @@ struct scan_control {
115 nodemask_t *nodemask; 94 nodemask_t *nodemask;
116}; 95};
117 96
118struct mem_cgroup_zone {
119 struct mem_cgroup *mem_cgroup;
120 struct zone *zone;
121};
122
123#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 97#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
124 98
125#ifdef ARCH_HAS_PREFETCH 99#ifdef ARCH_HAS_PREFETCH
@@ -164,44 +138,21 @@ static bool global_reclaim(struct scan_control *sc)
164{ 138{
165 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
166} 140}
167
168static bool scanning_global_lru(struct mem_cgroup_zone *mz)
169{
170 return !mz->mem_cgroup;
171}
172#else 141#else
173static bool global_reclaim(struct scan_control *sc) 142static bool global_reclaim(struct scan_control *sc)
174{ 143{
175 return true; 144 return true;
176} 145}
177
178static bool scanning_global_lru(struct mem_cgroup_zone *mz)
179{
180 return true;
181}
182#endif 146#endif
183 147
184static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) 148static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
185{ 149{
186 if (!scanning_global_lru(mz)) 150 if (!mem_cgroup_disabled())
187 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); 151 return mem_cgroup_get_lru_size(lruvec, lru);
188 152
189 return &mz->zone->reclaim_stat; 153 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
190} 154}
191 155
192static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
193 enum lru_list lru)
194{
195 if (!scanning_global_lru(mz))
196 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
197 zone_to_nid(mz->zone),
198 zone_idx(mz->zone),
199 BIT(lru));
200
201 return zone_page_state(mz->zone, NR_LRU_BASE + lru);
202}
203
204
205/* 156/*
206 * Add a shrinker callback to be called from the vm 157 * Add a shrinker callback to be called from the vm
207 */ 158 */
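
With struct mem_cgroup_zone gone, the (memcg, zone) pair is addressed through a lruvec, and the old scanning_global_lru()/zone_nr_lru_pages() pair collapses into get_lru_size(), which consults memcg accounting when memcg is enabled and falls back to the zone vmstat counters otherwise. The sketch below only mimics the shape of the new helper with toy types (lruvec_sketch and its counter array are stand-ins, not the kernel API); the summation in main() mirrors what get_scan_count() does later in this patch:

#include <stdio.h>

/* same member names as the kernel's enum lru_list, values assumed */
enum lru_list {
	LRU_INACTIVE_ANON,
	LRU_ACTIVE_ANON,
	LRU_INACTIVE_FILE,
	LRU_ACTIVE_FILE,
	LRU_UNEVICTABLE,
	NR_LRU_LISTS,
};

struct lruvec_sketch {
	unsigned long nr_pages[NR_LRU_LISTS];	/* toy per-LRU counters */
};

/* shape of the new helper: one lookup keyed by (lruvec, lru) */
static unsigned long get_lru_size(struct lruvec_sketch *lruvec, enum lru_list lru)
{
	return lruvec->nr_pages[lru];
}

int main(void)
{
	struct lruvec_sketch lruvec = {
		.nr_pages = { 4000, 1000, 16000, 8000, 0 },
	};

	/* the same anon/file summation get_scan_count() performs */
	unsigned long anon = get_lru_size(&lruvec, LRU_ACTIVE_ANON) +
			     get_lru_size(&lruvec, LRU_INACTIVE_ANON);
	unsigned long file = get_lru_size(&lruvec, LRU_ACTIVE_FILE) +
			     get_lru_size(&lruvec, LRU_INACTIVE_FILE);

	printf("anon=%lu file=%lu\n", anon, file);
	return 0;
}
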
@@ -364,39 +315,6 @@ out:
364 return ret; 315 return ret;
365} 316}
366 317
367static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync)
369{
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371
372 /*
373 * Initially assume we are entering either lumpy reclaim or
374 * reclaim/compaction.Depending on the order, we will either set the
375 * sync mode or just reclaim order-0 pages later.
376 */
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382 /*
383 * Avoid using lumpy reclaim or reclaim/compaction if possible by
384 * restricting when its set to either costly allocations or when
385 * under memory pressure
386 */
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
388 sc->reclaim_mode |= syncmode;
389 else if (sc->order && priority < DEF_PRIORITY - 2)
390 sc->reclaim_mode |= syncmode;
391 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393}
394
395static void reset_reclaim_mode(struct scan_control *sc)
396{
397 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
398}
399
400static inline int is_page_cache_freeable(struct page *page) 318static inline int is_page_cache_freeable(struct page *page)
401{ 319{
402 /* 320 /*
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
416 return 1; 334 return 1;
417 if (bdi == current->backing_dev_info) 335 if (bdi == current->backing_dev_info)
418 return 1; 336 return 1;
419
420 /* lumpy reclaim for hugepage often need a lot of write */
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0; 337 return 0;
424} 338}
425 339
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
523 /* synchronous write or broken a_ops? */ 437 /* synchronous write or broken a_ops? */
524 ClearPageReclaim(page); 438 ClearPageReclaim(page);
525 } 439 }
526 trace_mm_vmscan_writepage(page, 440 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
527 trace_reclaim_flags(page, sc->reclaim_mode));
528 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
529 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
530 } 443 }
@@ -701,19 +614,15 @@ enum page_references {
701}; 614};
702 615
703static enum page_references page_check_references(struct page *page, 616static enum page_references page_check_references(struct page *page,
704 struct mem_cgroup_zone *mz,
705 struct scan_control *sc) 617 struct scan_control *sc)
706{ 618{
707 int referenced_ptes, referenced_page; 619 int referenced_ptes, referenced_page;
708 unsigned long vm_flags; 620 unsigned long vm_flags;
709 621
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); 622 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
623 &vm_flags);
711 referenced_page = TestClearPageReferenced(page); 624 referenced_page = TestClearPageReferenced(page);
712 625
713 /* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717 /* 626 /*
718 * Mlock lost the isolation race with us. Let try_to_unmap() 627 * Mlock lost the isolation race with us. Let try_to_unmap()
719 * move the page to the unevictable list. 628 * move the page to the unevictable list.
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page,
722 return PAGEREF_RECLAIM; 631 return PAGEREF_RECLAIM;
723 632
724 if (referenced_ptes) { 633 if (referenced_ptes) {
725 if (PageAnon(page)) 634 if (PageSwapBacked(page))
726 return PAGEREF_ACTIVATE; 635 return PAGEREF_ACTIVATE;
727 /* 636 /*
728 * All mapped pages start out with page table 637 * All mapped pages start out with page table
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
763 * shrink_page_list() returns the number of reclaimed pages 672 * shrink_page_list() returns the number of reclaimed pages
764 */ 673 */
765static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
766 struct mem_cgroup_zone *mz, 675 struct zone *zone,
767 struct scan_control *sc, 676 struct scan_control *sc,
768 int priority,
769 unsigned long *ret_nr_dirty, 677 unsigned long *ret_nr_dirty,
770 unsigned long *ret_nr_writeback) 678 unsigned long *ret_nr_writeback)
771{ 679{
@@ -794,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
794 goto keep; 702 goto keep;
795 703
796 VM_BUG_ON(PageActive(page)); 704 VM_BUG_ON(PageActive(page));
797 VM_BUG_ON(page_zone(page) != mz->zone); 705 VM_BUG_ON(page_zone(page) != zone);
798 706
799 sc->nr_scanned++; 707 sc->nr_scanned++;
800 708
@@ -813,22 +721,11 @@ static unsigned long shrink_page_list(struct list_head *page_list,
813 721
814 if (PageWriteback(page)) { 722 if (PageWriteback(page)) {
815 nr_writeback++; 723 nr_writeback++;
816 /* 724 unlock_page(page);
817 * Synchronous reclaim cannot queue pages for 725 goto keep;
818 * writeback due to the possibility of stack overflow
819 * but if it encounters a page under writeback, wait
820 * for the IO to complete.
821 */
822 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
823 may_enter_fs)
824 wait_on_page_writeback(page);
825 else {
826 unlock_page(page);
827 goto keep_lumpy;
828 }
829 } 726 }
830 727
831 references = page_check_references(page, mz, sc); 728 references = page_check_references(page, sc);
832 switch (references) { 729 switch (references) {
833 case PAGEREF_ACTIVATE: 730 case PAGEREF_ACTIVATE:
834 goto activate_locked; 731 goto activate_locked;
@@ -879,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
879 * unless under significant pressure. 776 * unless under significant pressure.
880 */ 777 */
881 if (page_is_file_cache(page) && 778 if (page_is_file_cache(page) &&
882 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { 779 (!current_is_kswapd() ||
780 sc->priority >= DEF_PRIORITY - 2)) {
883 /* 781 /*
884 * Immediately reclaim when written back. 782 * Immediately reclaim when written back.
885 * Similar in principal to deactivate_page() 783 * Similar in principal to deactivate_page()
@@ -908,7 +806,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
908 goto activate_locked; 806 goto activate_locked;
909 case PAGE_SUCCESS: 807 case PAGE_SUCCESS:
910 if (PageWriteback(page)) 808 if (PageWriteback(page))
911 goto keep_lumpy; 809 goto keep;
912 if (PageDirty(page)) 810 if (PageDirty(page))
913 goto keep; 811 goto keep;
914 812
@@ -994,7 +892,6 @@ cull_mlocked:
994 try_to_free_swap(page); 892 try_to_free_swap(page);
995 unlock_page(page); 893 unlock_page(page);
996 putback_lru_page(page); 894 putback_lru_page(page);
997 reset_reclaim_mode(sc);
998 continue; 895 continue;
999 896
1000activate_locked: 897activate_locked:
@@ -1007,8 +904,6 @@ activate_locked:
1007keep_locked: 904keep_locked:
1008 unlock_page(page); 905 unlock_page(page);
1009keep: 906keep:
1010 reset_reclaim_mode(sc);
1011keep_lumpy:
1012 list_add(&page->lru, &ret_pages); 907 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 908 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 } 909 }
@@ -1020,7 +915,7 @@ keep_lumpy:
1020 * will encounter the same problem 915 * will encounter the same problem
1021 */ 916 */
1022 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) 917 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
1023 zone_set_flag(mz->zone, ZONE_CONGESTED); 918 zone_set_flag(zone, ZONE_CONGESTED);
1024 919
1025 free_hot_cold_page_list(&free_pages, 1); 920 free_hot_cold_page_list(&free_pages, 1);
1026 921
@@ -1041,34 +936,15 @@ keep_lumpy:
1041 * 936 *
1042 * returns 0 on success, -ve errno on failure. 937 * returns 0 on success, -ve errno on failure.
1043 */ 938 */
1044int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) 939int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1045{ 940{
1046 bool all_lru_mode;
1047 int ret = -EINVAL; 941 int ret = -EINVAL;
1048 942
1049 /* Only take pages on the LRU. */ 943 /* Only take pages on the LRU. */
1050 if (!PageLRU(page)) 944 if (!PageLRU(page))
1051 return ret; 945 return ret;
1052 946
1053 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == 947 /* Do not give back unevictable pages for compaction */
1054 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1055
1056 /*
1057 * When checking the active state, we need to be sure we are
1058 * dealing with comparible boolean values. Take the logical not
1059 * of each.
1060 */
1061 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1062 return ret;
1063
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret;
1066
1067 /*
1068 * When this function is being called for lumpy reclaim, we
1069 * initially look into all LRU pages, active, inactive and
1070 * unevictable; only give shrink_page_list evictable pages.
1071 */
1072 if (PageUnevictable(page)) 948 if (PageUnevictable(page))
1073 return ret; 949 return ret;
1074 950
@@ -1135,54 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1135 * Appropriate locks must be held before calling this function. 1011 * Appropriate locks must be held before calling this function.
1136 * 1012 *
1137 * @nr_to_scan: The number of pages to look through on the list. 1013 * @nr_to_scan: The number of pages to look through on the list.
1138 * @mz: The mem_cgroup_zone to pull pages from. 1014 * @lruvec: The LRU vector to pull pages from.
1139 * @dst: The temp list to put pages on to. 1015 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1016 * @nr_scanned: The number of pages that were scanned.
1141 * @sc: The scan_control struct for this reclaim session 1017 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1018 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1019 * @lru: LRU list id for isolating
1144 * @file: True [1] if isolating file [!anon] pages
1145 * 1020 *
1146 * returns how many pages were moved onto *@dst. 1021 * returns how many pages were moved onto *@dst.
1147 */ 1022 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1023static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1024 struct lruvec *lruvec, struct list_head *dst,
1150 unsigned long *nr_scanned, struct scan_control *sc, 1025 unsigned long *nr_scanned, struct scan_control *sc,
1151 isolate_mode_t mode, int active, int file) 1026 isolate_mode_t mode, enum lru_list lru)
1152{ 1027{
1153 struct lruvec *lruvec; 1028 struct list_head *src = &lruvec->lists[lru];
1154 struct list_head *src;
1155 unsigned long nr_taken = 0; 1029 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan; 1030 unsigned long scan;
1160 int lru = LRU_BASE;
1161
1162 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1163 if (active)
1164 lru += LRU_ACTIVE;
1165 if (file)
1166 lru += LRU_FILE;
1167 src = &lruvec->lists[lru];
1168 1031
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1032 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page; 1033 struct page *page;
1171 unsigned long pfn; 1034 int nr_pages;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175 1035
1176 page = lru_to_page(src); 1036 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags); 1037 prefetchw_prev_lru_page(page, src, flags);
1178 1038
1179 VM_BUG_ON(!PageLRU(page)); 1039 VM_BUG_ON(!PageLRU(page));
1180 1040
1181 switch (__isolate_lru_page(page, mode, file)) { 1041 switch (__isolate_lru_page(page, mode)) {
1182 case 0: 1042 case 0:
1183 mem_cgroup_lru_del(page); 1043 nr_pages = hpage_nr_pages(page);
1044 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1184 list_move(&page->lru, dst); 1045 list_move(&page->lru, dst);
1185 nr_taken += hpage_nr_pages(page); 1046 nr_taken += nr_pages;
1186 break; 1047 break;
1187 1048
1188 case -EBUSY: 1049 case -EBUSY:
@@ -1193,93 +1054,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1193 default: 1054 default:
1194 BUG(); 1055 BUG();
1195 } 1056 }
1196
1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue;
1199
1200 /*
1201 * Attempt to take all pages in the order aligned region
1202 * surrounding the tag page. Only take those pages of
1203 * the same active state as that tag page. We may safely
1204 * round the target page pfn down to the requested order
1205 * as the mem_map is guaranteed valid out to MAX_ORDER,
1206 * where that page is in a different zone we will detect
1207 * it from its zone id and abort this block scan.
1208 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216 /* The target page is in the block, ignore it. */
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220 /* Avoid holes within the zone. */
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226 /* Check that we have not crossed a zone boundary. */
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230 /*
1231 * If we don't have enough swap space, reclaiming of
1232 * anon page which don't already have a swap slot is
1233 * pointless.
1234 */
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252 /*
1253 * Check if the page is freed already.
1254 *
1255 * We can't use page_count() as that
1256 * requires compound_head and we don't
1257 * have a pin on the page here. If a
1258 * page is tail, we may or may not
1259 * have isolated the head, so assume
1260 * it's not free, it'd be tricky to
1261 * track the head status without a
1262 * page pin.
1263 */
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271 /* If we break out of the loop above, lumpy reclaim failed */
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 } 1057 }
1275 1058
1276 *nr_scanned = scan; 1059 *nr_scanned = scan;
1277 1060 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1278 trace_mm_vmscan_lru_isolate(sc->order, 1061 nr_taken, mode, is_file_lru(lru));
1279 nr_to_scan, scan,
1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file);
1283 return nr_taken; 1062 return nr_taken;
1284} 1063}
1285 1064
@@ -1316,15 +1095,16 @@ int isolate_lru_page(struct page *page)
1316 1095
1317 if (PageLRU(page)) { 1096 if (PageLRU(page)) {
1318 struct zone *zone = page_zone(page); 1097 struct zone *zone = page_zone(page);
1098 struct lruvec *lruvec;
1319 1099
1320 spin_lock_irq(&zone->lru_lock); 1100 spin_lock_irq(&zone->lru_lock);
1101 lruvec = mem_cgroup_page_lruvec(page, zone);
1321 if (PageLRU(page)) { 1102 if (PageLRU(page)) {
1322 int lru = page_lru(page); 1103 int lru = page_lru(page);
1323 ret = 0;
1324 get_page(page); 1104 get_page(page);
1325 ClearPageLRU(page); 1105 ClearPageLRU(page);
1326 1106 del_page_from_lru_list(page, lruvec, lru);
1327 del_page_from_lru_list(zone, page, lru); 1107 ret = 0;
1328 } 1108 }
1329 spin_unlock_irq(&zone->lru_lock); 1109 spin_unlock_irq(&zone->lru_lock);
1330 } 1110 }
@@ -1357,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file,
1357} 1137}
1358 1138
1359static noinline_for_stack void 1139static noinline_for_stack void
1360putback_inactive_pages(struct mem_cgroup_zone *mz, 1140putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1361 struct list_head *page_list)
1362{ 1141{
1363 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1142 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1364 struct zone *zone = mz->zone; 1143 struct zone *zone = lruvec_zone(lruvec);
1365 LIST_HEAD(pages_to_free); 1144 LIST_HEAD(pages_to_free);
1366 1145
1367 /* 1146 /*
@@ -1379,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1379 spin_lock_irq(&zone->lru_lock); 1158 spin_lock_irq(&zone->lru_lock);
1380 continue; 1159 continue;
1381 } 1160 }
1161
1162 lruvec = mem_cgroup_page_lruvec(page, zone);
1163
1382 SetPageLRU(page); 1164 SetPageLRU(page);
1383 lru = page_lru(page); 1165 lru = page_lru(page);
1384 add_page_to_lru_list(zone, page, lru); 1166 add_page_to_lru_list(page, lruvec, lru);
1167
1385 if (is_active_lru(lru)) { 1168 if (is_active_lru(lru)) {
1386 int file = is_file_lru(lru); 1169 int file = is_file_lru(lru);
1387 int numpages = hpage_nr_pages(page); 1170 int numpages = hpage_nr_pages(page);
@@ -1390,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1390 if (put_page_testzero(page)) { 1173 if (put_page_testzero(page)) {
1391 __ClearPageLRU(page); 1174 __ClearPageLRU(page);
1392 __ClearPageActive(page); 1175 __ClearPageActive(page);
1393 del_page_from_lru_list(zone, page, lru); 1176 del_page_from_lru_list(page, lruvec, lru);
1394 1177
1395 if (unlikely(PageCompound(page))) { 1178 if (unlikely(PageCompound(page))) {
1396 spin_unlock_irq(&zone->lru_lock); 1179 spin_unlock_irq(&zone->lru_lock);
@@ -1407,112 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1407 list_splice(&pages_to_free, page_list); 1190 list_splice(&pages_to_free, page_list);
1408} 1191}
1409 1192
1410static noinline_for_stack void
1411update_isolated_counts(struct mem_cgroup_zone *mz,
1412 struct list_head *page_list,
1413 unsigned long *nr_anon,
1414 unsigned long *nr_file)
1415{
1416 struct zone *zone = mz->zone;
1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1418 unsigned long nr_active = 0;
1419 struct page *page;
1420 int lru;
1421
1422 /*
1423 * Count pages and clear active flags
1424 */
1425 list_for_each_entry(page, page_list, lru) {
1426 int numpages = hpage_nr_pages(page);
1427 lru = page_lru_base_type(page);
1428 if (PageActive(page)) {
1429 lru += LRU_ACTIVE;
1430 ClearPageActive(page);
1431 nr_active += numpages;
1432 }
1433 count[lru] += numpages;
1434 }
1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1440 -count[LRU_ACTIVE_FILE]);
1441 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1442 -count[LRU_INACTIVE_FILE]);
1443 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1444 -count[LRU_ACTIVE_ANON]);
1445 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1446 -count[LRU_INACTIVE_ANON]);
1447
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450
1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1454}
1455
1456/*
1457 * Returns true if a direct reclaim should wait on pages under writeback.
1458 *
1459 * If we are direct reclaiming for contiguous pages and we do not reclaim
1460 * everything in the list, try again and wait for writeback IO to complete.
1461 * This will stall high-order allocations noticeably. Only do that when really
1462 * need to free the pages under high memory pressure.
1463 */
1464static inline bool should_reclaim_stall(unsigned long nr_taken,
1465 unsigned long nr_freed,
1466 int priority,
1467 struct scan_control *sc)
1468{
1469 int lumpy_stall_priority;
1470
1471 /* kswapd should not stall on sync IO */
1472 if (current_is_kswapd())
1473 return false;
1474
1475 /* Only stall on lumpy reclaim */
1476 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1477 return false;
1478
1479 /* If we have reclaimed everything on the isolated list, no stall */
1480 if (nr_freed == nr_taken)
1481 return false;
1482
1483 /*
1484 * For high-order allocations, there are two stall thresholds.
1485 * High-cost allocations stall immediately where as lower
1486 * order allocations such as stacks require the scanning
1487 * priority to be much higher before stalling.
1488 */
1489 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1490 lumpy_stall_priority = DEF_PRIORITY;
1491 else
1492 lumpy_stall_priority = DEF_PRIORITY / 3;
1493
1494 return priority <= lumpy_stall_priority;
1495}
1496
1497/* 1193/*
1498 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1194 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1499 * of reclaimed pages 1195 * of reclaimed pages
1500 */ 1196 */
1501static noinline_for_stack unsigned long 1197static noinline_for_stack unsigned long
1502shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, 1198shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1503 struct scan_control *sc, int priority, int file) 1199 struct scan_control *sc, enum lru_list lru)
1504{ 1200{
1505 LIST_HEAD(page_list); 1201 LIST_HEAD(page_list);
1506 unsigned long nr_scanned; 1202 unsigned long nr_scanned;
1507 unsigned long nr_reclaimed = 0; 1203 unsigned long nr_reclaimed = 0;
1508 unsigned long nr_taken; 1204 unsigned long nr_taken;
1509 unsigned long nr_anon;
1510 unsigned long nr_file;
1511 unsigned long nr_dirty = 0; 1205 unsigned long nr_dirty = 0;
1512 unsigned long nr_writeback = 0; 1206 unsigned long nr_writeback = 0;
1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE; 1207 isolate_mode_t isolate_mode = 0;
1514 struct zone *zone = mz->zone; 1208 int file = is_file_lru(lru);
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1209 struct zone *zone = lruvec_zone(lruvec);
1210 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1516 1211
1517 while (unlikely(too_many_isolated(zone, file, sc))) { 1212 while (unlikely(too_many_isolated(zone, file, sc))) {
1518 congestion_wait(BLK_RW_ASYNC, HZ/10); 1213 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,10 +1217,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 return SWAP_CLUSTER_MAX; 1217 return SWAP_CLUSTER_MAX;
1523 } 1218 }
1524 1219
1525 set_reclaim_mode(priority, sc, false);
1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1527 isolate_mode |= ISOLATE_ACTIVE;
1528
1529 lru_add_drain(); 1220 lru_add_drain();
1530 1221
1531 if (!sc->may_unmap) 1222 if (!sc->may_unmap)
@@ -1535,38 +1226,30 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1535 1226
1536 spin_lock_irq(&zone->lru_lock); 1227 spin_lock_irq(&zone->lru_lock);
1537 1228
1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, 1229 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1539 sc, isolate_mode, 0, file); 1230 &nr_scanned, sc, isolate_mode, lru);
1231
1232 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1233 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1234
1540 if (global_reclaim(sc)) { 1235 if (global_reclaim(sc)) {
1541 zone->pages_scanned += nr_scanned; 1236 zone->pages_scanned += nr_scanned;
1542 if (current_is_kswapd()) 1237 if (current_is_kswapd())
1543 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1238 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1544 nr_scanned);
1545 else 1239 else
1546 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1240 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1547 nr_scanned);
1548 } 1241 }
1549 spin_unlock_irq(&zone->lru_lock); 1242 spin_unlock_irq(&zone->lru_lock);
1550 1243
1551 if (nr_taken == 0) 1244 if (nr_taken == 0)
1552 return 0; 1245 return 0;
1553 1246
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1247 nr_reclaimed = shrink_page_list(&page_list, zone, sc,
1555
1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1557 &nr_dirty, &nr_writeback); 1248 &nr_dirty, &nr_writeback);
1558 1249
1559 /* Check if we should syncronously wait for writeback */
1560 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1561 set_reclaim_mode(priority, sc, true);
1562 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1563 priority, &nr_dirty, &nr_writeback);
1564 }
1565
1566 spin_lock_irq(&zone->lru_lock); 1250 spin_lock_irq(&zone->lru_lock);
1567 1251
1568 reclaim_stat->recent_scanned[0] += nr_anon; 1252 reclaim_stat->recent_scanned[file] += nr_taken;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570 1253
1571 if (global_reclaim(sc)) { 1254 if (global_reclaim(sc)) {
1572 if (current_is_kswapd()) 1255 if (current_is_kswapd())
@@ -1577,10 +1260,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1577 nr_reclaimed); 1260 nr_reclaimed);
1578 } 1261 }
1579 1262
1580 putback_inactive_pages(mz, &page_list); 1263 putback_inactive_pages(lruvec, &page_list);
1581 1264
1582 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); 1265 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1583 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1584 1266
1585 spin_unlock_irq(&zone->lru_lock); 1267 spin_unlock_irq(&zone->lru_lock);
1586 1268
@@ -1609,14 +1291,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1609 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any 1291 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1610 * isolated page is PageWriteback 1292 * isolated page is PageWriteback
1611 */ 1293 */
1612 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) 1294 if (nr_writeback && nr_writeback >=
1295 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1613 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1296 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1614 1297
1615 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1298 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1616 zone_idx(zone), 1299 zone_idx(zone),
1617 nr_scanned, nr_reclaimed, 1300 nr_scanned, nr_reclaimed,
1618 priority, 1301 sc->priority,
1619 trace_shrink_flags(file, sc->reclaim_mode)); 1302 trace_shrink_flags(file));
1620 return nr_reclaimed; 1303 return nr_reclaimed;
1621} 1304}
1622 1305
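
The writeback throttling check above now derives its threshold from sc->priority: nr_taken >> (DEF_PRIORITY - sc->priority). A small standalone illustration of how that threshold tightens as priority drops, with DEF_PRIORITY assumed to be 12 and the batch size taken as SWAP_CLUSTER_MAX (32) as in kernels of this period:

#include <stdio.h>

#define DEF_PRIORITY 12		/* assumed kernel value */

/* threshold used by shrink_inactive_list() before wait_iff_congested() */
static unsigned long writeback_throttle_threshold(unsigned long nr_taken,
						  int priority)
{
	return nr_taken >> (DEF_PRIORITY - priority);
}

int main(void)
{
	unsigned long nr_taken = 32;	/* one SWAP_CLUSTER_MAX batch */

	/* at DEF_PRIORITY the whole batch must be under writeback; by
	 * DEF_PRIORITY - 6 the threshold reaches 0, so any PageWriteback
	 * page in the batch triggers the stall */
	for (int priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--)
		printf("priority %2d: throttle if nr_writeback >= %lu\n",
		       priority, writeback_throttle_threshold(nr_taken, priority));
	return 0;
}
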
@@ -1638,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1638 * But we had to alter page->flags anyway. 1321 * But we had to alter page->flags anyway.
1639 */ 1322 */
1640 1323
1641static void move_active_pages_to_lru(struct zone *zone, 1324static void move_active_pages_to_lru(struct lruvec *lruvec,
1642 struct list_head *list, 1325 struct list_head *list,
1643 struct list_head *pages_to_free, 1326 struct list_head *pages_to_free,
1644 enum lru_list lru) 1327 enum lru_list lru)
1645{ 1328{
1329 struct zone *zone = lruvec_zone(lruvec);
1646 unsigned long pgmoved = 0; 1330 unsigned long pgmoved = 0;
1647 struct page *page; 1331 struct page *page;
1332 int nr_pages;
1648 1333
1649 while (!list_empty(list)) { 1334 while (!list_empty(list)) {
1650 struct lruvec *lruvec;
1651
1652 page = lru_to_page(list); 1335 page = lru_to_page(list);
1336 lruvec = mem_cgroup_page_lruvec(page, zone);
1653 1337
1654 VM_BUG_ON(PageLRU(page)); 1338 VM_BUG_ON(PageLRU(page));
1655 SetPageLRU(page); 1339 SetPageLRU(page);
1656 1340
1657 lruvec = mem_cgroup_lru_add_list(zone, page, lru); 1341 nr_pages = hpage_nr_pages(page);
1342 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1658 list_move(&page->lru, &lruvec->lists[lru]); 1343 list_move(&page->lru, &lruvec->lists[lru]);
1659 pgmoved += hpage_nr_pages(page); 1344 pgmoved += nr_pages;
1660 1345
1661 if (put_page_testzero(page)) { 1346 if (put_page_testzero(page)) {
1662 __ClearPageLRU(page); 1347 __ClearPageLRU(page);
1663 __ClearPageActive(page); 1348 __ClearPageActive(page);
1664 del_page_from_lru_list(zone, page, lru); 1349 del_page_from_lru_list(page, lruvec, lru);
1665 1350
1666 if (unlikely(PageCompound(page))) { 1351 if (unlikely(PageCompound(page))) {
1667 spin_unlock_irq(&zone->lru_lock); 1352 spin_unlock_irq(&zone->lru_lock);
@@ -1677,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone,
1677} 1362}
1678 1363
1679static void shrink_active_list(unsigned long nr_to_scan, 1364static void shrink_active_list(unsigned long nr_to_scan,
1680 struct mem_cgroup_zone *mz, 1365 struct lruvec *lruvec,
1681 struct scan_control *sc, 1366 struct scan_control *sc,
1682 int priority, int file) 1367 enum lru_list lru)
1683{ 1368{
1684 unsigned long nr_taken; 1369 unsigned long nr_taken;
1685 unsigned long nr_scanned; 1370 unsigned long nr_scanned;
@@ -1688,15 +1373,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1688 LIST_HEAD(l_active); 1373 LIST_HEAD(l_active);
1689 LIST_HEAD(l_inactive); 1374 LIST_HEAD(l_inactive);
1690 struct page *page; 1375 struct page *page;
1691 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1376 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1692 unsigned long nr_rotated = 0; 1377 unsigned long nr_rotated = 0;
1693 isolate_mode_t isolate_mode = ISOLATE_ACTIVE; 1378 isolate_mode_t isolate_mode = 0;
1694 struct zone *zone = mz->zone; 1379 int file = is_file_lru(lru);
1380 struct zone *zone = lruvec_zone(lruvec);
1695 1381
1696 lru_add_drain(); 1382 lru_add_drain();
1697 1383
1698 reset_reclaim_mode(sc);
1699
1700 if (!sc->may_unmap) 1384 if (!sc->may_unmap)
1701 isolate_mode |= ISOLATE_UNMAPPED; 1385 isolate_mode |= ISOLATE_UNMAPPED;
1702 if (!sc->may_writepage) 1386 if (!sc->may_writepage)
@@ -1704,18 +1388,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
1704 1388
1705 spin_lock_irq(&zone->lru_lock); 1389 spin_lock_irq(&zone->lru_lock);
1706 1390
1707 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, 1391 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1708 isolate_mode, 1, file); 1392 &nr_scanned, sc, isolate_mode, lru);
1709 if (global_reclaim(sc)) 1393 if (global_reclaim(sc))
1710 zone->pages_scanned += nr_scanned; 1394 zone->pages_scanned += nr_scanned;
1711 1395
1712 reclaim_stat->recent_scanned[file] += nr_taken; 1396 reclaim_stat->recent_scanned[file] += nr_taken;
1713 1397
1714 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1398 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1715 if (file) 1399 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1716 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1717 else
1718 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1719 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1400 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1720 spin_unlock_irq(&zone->lru_lock); 1401 spin_unlock_irq(&zone->lru_lock);
1721 1402
@@ -1737,7 +1418,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 } 1418 }
1738 } 1419 }
1739 1420
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1421 if (page_referenced(page, 0, sc->target_mem_cgroup,
1422 &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1423 nr_rotated += hpage_nr_pages(page);
1742 /* 1424 /*
1743 * Identify referenced, file-backed active pages and 1425 * Identify referenced, file-backed active pages and
@@ -1770,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1770 */ 1452 */
1771 reclaim_stat->recent_rotated[file] += nr_rotated; 1453 reclaim_stat->recent_rotated[file] += nr_rotated;
1772 1454
1773 move_active_pages_to_lru(zone, &l_active, &l_hold, 1455 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1774 LRU_ACTIVE + file * LRU_FILE); 1456 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1775 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1776 LRU_BASE + file * LRU_FILE);
1777 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1457 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1778 spin_unlock_irq(&zone->lru_lock); 1458 spin_unlock_irq(&zone->lru_lock);
1779 1459
@@ -1796,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
1796 1476
1797/** 1477/**
1798 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1478 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1799 * @zone: zone to check 1479 * @lruvec: LRU vector to check
1800 * @sc: scan control of this context
1801 * 1480 *
1802 * Returns true if the zone does not have enough inactive anon pages, 1481 * Returns true if the zone does not have enough inactive anon pages,
1803 * meaning some active anon pages need to be deactivated. 1482 * meaning some active anon pages need to be deactivated.
1804 */ 1483 */
1805static int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1484static int inactive_anon_is_low(struct lruvec *lruvec)
1806{ 1485{
1807 /* 1486 /*
1808 * If we don't have swap space, anonymous page deactivation 1487 * If we don't have swap space, anonymous page deactivation
@@ -1811,14 +1490,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1811 if (!total_swap_pages) 1490 if (!total_swap_pages)
1812 return 0; 1491 return 0;
1813 1492
1814 if (!scanning_global_lru(mz)) 1493 if (!mem_cgroup_disabled())
1815 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, 1494 return mem_cgroup_inactive_anon_is_low(lruvec);
1816 mz->zone);
1817 1495
1818 return inactive_anon_is_low_global(mz->zone); 1496 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1819} 1497}
1820#else 1498#else
1821static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1499static inline int inactive_anon_is_low(struct lruvec *lruvec)
1822{ 1500{
1823 return 0; 1501 return 0;
1824} 1502}
@@ -1836,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1836 1514
1837/** 1515/**
1838 * inactive_file_is_low - check if file pages need to be deactivated 1516 * inactive_file_is_low - check if file pages need to be deactivated
1839 * @mz: memory cgroup and zone to check 1517 * @lruvec: LRU vector to check
1840 * 1518 *
1841 * When the system is doing streaming IO, memory pressure here 1519 * When the system is doing streaming IO, memory pressure here
1842 * ensures that active file pages get deactivated, until more 1520 * ensures that active file pages get deactivated, until more
@@ -1848,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone)
1848 * This uses a different ratio than the anonymous pages, because 1526 * This uses a different ratio than the anonymous pages, because
1849 * the page cache uses a use-once replacement algorithm. 1527 * the page cache uses a use-once replacement algorithm.
1850 */ 1528 */
1851static int inactive_file_is_low(struct mem_cgroup_zone *mz) 1529static int inactive_file_is_low(struct lruvec *lruvec)
1852{ 1530{
1853 if (!scanning_global_lru(mz)) 1531 if (!mem_cgroup_disabled())
1854 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, 1532 return mem_cgroup_inactive_file_is_low(lruvec);
1855 mz->zone);
1856 1533
1857 return inactive_file_is_low_global(mz->zone); 1534 return inactive_file_is_low_global(lruvec_zone(lruvec));
1858} 1535}
1859 1536
1860static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) 1537static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1861{ 1538{
1862 if (file) 1539 if (is_file_lru(lru))
1863 return inactive_file_is_low(mz); 1540 return inactive_file_is_low(lruvec);
1864 else 1541 else
1865 return inactive_anon_is_low(mz); 1542 return inactive_anon_is_low(lruvec);
1866} 1543}
1867 1544
1868static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1545static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1869 struct mem_cgroup_zone *mz, 1546 struct lruvec *lruvec, struct scan_control *sc)
1870 struct scan_control *sc, int priority)
1871{ 1547{
1872 int file = is_file_lru(lru);
1873
1874 if (is_active_lru(lru)) { 1548 if (is_active_lru(lru)) {
1875 if (inactive_list_is_low(mz, file)) 1549 if (inactive_list_is_low(lruvec, lru))
1876 shrink_active_list(nr_to_scan, mz, sc, priority, file); 1550 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1877 return 0; 1551 return 0;
1878 } 1552 }
1879 1553
1880 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); 1554 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1881} 1555}
1882 1556
1883static int vmscan_swappiness(struct mem_cgroup_zone *mz, 1557static int vmscan_swappiness(struct scan_control *sc)
1884 struct scan_control *sc)
1885{ 1558{
1886 if (global_reclaim(sc)) 1559 if (global_reclaim(sc))
1887 return vm_swappiness; 1560 return vm_swappiness;
1888 return mem_cgroup_swappiness(mz->mem_cgroup); 1561 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1889} 1562}
1890 1563
1891/* 1564/*
@@ -1896,17 +1569,18 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1896 * 1569 *
1897 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1570 * nr[0] = anon pages to scan; nr[1] = file pages to scan
1898 */ 1571 */
1899static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, 1572static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1900 unsigned long *nr, int priority) 1573 unsigned long *nr)
1901{ 1574{
1902 unsigned long anon, file, free; 1575 unsigned long anon, file, free;
1903 unsigned long anon_prio, file_prio; 1576 unsigned long anon_prio, file_prio;
1904 unsigned long ap, fp; 1577 unsigned long ap, fp;
1905 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1578 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1906 u64 fraction[2], denominator; 1579 u64 fraction[2], denominator;
1907 enum lru_list lru; 1580 enum lru_list lru;
1908 int noswap = 0; 1581 int noswap = 0;
1909 bool force_scan = false; 1582 bool force_scan = false;
1583 struct zone *zone = lruvec_zone(lruvec);
1910 1584
1911 /* 1585 /*
1912 * If the zone or memcg is small, nr[l] can be 0. This 1586 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1918,7 +1592,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1918 * latencies, so it's better to scan a minimum amount there as 1592 * latencies, so it's better to scan a minimum amount there as
1919 * well. 1593 * well.
1920 */ 1594 */
1921 if (current_is_kswapd() && mz->zone->all_unreclaimable) 1595 if (current_is_kswapd() && zone->all_unreclaimable)
1922 force_scan = true; 1596 force_scan = true;
1923 if (!global_reclaim(sc)) 1597 if (!global_reclaim(sc))
1924 force_scan = true; 1598 force_scan = true;
@@ -1932,16 +1606,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1932 goto out; 1606 goto out;
1933 } 1607 }
1934 1608
1935 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + 1609 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1936 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1610 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1937 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + 1611 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1938 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1612 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1939 1613
1940 if (global_reclaim(sc)) { 1614 if (global_reclaim(sc)) {
1941 free = zone_page_state(mz->zone, NR_FREE_PAGES); 1615 free = zone_page_state(zone, NR_FREE_PAGES);
1942 /* If we have very few page cache pages, 1616 /* If we have very few page cache pages,
1943 force-scan anon pages. */ 1617 force-scan anon pages. */
1944 if (unlikely(file + free <= high_wmark_pages(mz->zone))) { 1618 if (unlikely(file + free <= high_wmark_pages(zone))) {
1945 fraction[0] = 1; 1619 fraction[0] = 1;
1946 fraction[1] = 0; 1620 fraction[1] = 0;
1947 denominator = 1; 1621 denominator = 1;
@@ -1953,8 +1627,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1953 * With swappiness at 100, anonymous and file have the same priority. 1627 * With swappiness at 100, anonymous and file have the same priority.
1954 * This scanning priority is essentially the inverse of IO cost. 1628 * This scanning priority is essentially the inverse of IO cost.
1955 */ 1629 */
1956 anon_prio = vmscan_swappiness(mz, sc); 1630 anon_prio = vmscan_swappiness(sc);
1957 file_prio = 200 - vmscan_swappiness(mz, sc); 1631 file_prio = 200 - anon_prio;
1958 1632
1959 /* 1633 /*
1960 * OK, so we have swap space and a fair amount of page cache 1634 * OK, so we have swap space and a fair amount of page cache
@@ -1967,7 +1641,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1967 * 1641 *
1968 * anon in [0], file in [1] 1642 * anon in [0], file in [1]
1969 */ 1643 */
1970 spin_lock_irq(&mz->zone->lru_lock); 1644 spin_lock_irq(&zone->lru_lock);
1971 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1645 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1972 reclaim_stat->recent_scanned[0] /= 2; 1646 reclaim_stat->recent_scanned[0] /= 2;
1973 reclaim_stat->recent_rotated[0] /= 2; 1647 reclaim_stat->recent_rotated[0] /= 2;
@@ -1983,12 +1657,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1983 * proportional to the fraction of recently scanned pages on 1657 * proportional to the fraction of recently scanned pages on
1984 * each list that were recently referenced and in active use. 1658 * each list that were recently referenced and in active use.
1985 */ 1659 */
1986 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1660 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1987 ap /= reclaim_stat->recent_rotated[0] + 1; 1661 ap /= reclaim_stat->recent_rotated[0] + 1;
1988 1662
1989 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1663 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1990 fp /= reclaim_stat->recent_rotated[1] + 1; 1664 fp /= reclaim_stat->recent_rotated[1] + 1;
1991 spin_unlock_irq(&mz->zone->lru_lock); 1665 spin_unlock_irq(&zone->lru_lock);
1992 1666
1993 fraction[0] = ap; 1667 fraction[0] = ap;
1994 fraction[1] = fp; 1668 fraction[1] = fp;
@@ -1998,9 +1672,9 @@ out:
1998 int file = is_file_lru(lru); 1672 int file = is_file_lru(lru);
1999 unsigned long scan; 1673 unsigned long scan;
2000 1674
2001 scan = zone_nr_lru_pages(mz, lru); 1675 scan = get_lru_size(lruvec, lru);
2002 if (priority || noswap) { 1676 if (sc->priority || noswap || !vmscan_swappiness(sc)) {
2003 scan >>= priority; 1677 scan >>= sc->priority;
2004 if (!scan && force_scan) 1678 if (!scan && force_scan)
2005 scan = SWAP_CLUSTER_MAX; 1679 scan = SWAP_CLUSTER_MAX;
2006 scan = div64_u64(scan * fraction[file], denominator); 1680 scan = div64_u64(scan * fraction[file], denominator);
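
The hunks above drop the "+ 1" bias from anon_prio/file_prio and take the zone's lru_lock while the recent_scanned/recent_rotated ratios are read. Here is a worked, userspace-only example of the resulting proportional split, assuming the default vm_swappiness of 60 and that the denominator is ap + fp + 1 as in the surrounding, unchanged code:

#include <stdio.h>

/* worked example of the proportional split in get_scan_count() */
int main(void)
{
	unsigned long anon_prio = 60;		/* default vm_swappiness */
	unsigned long file_prio = 200 - anon_prio;

	/* recent_scanned/recent_rotated as kept in lruvec->reclaim_stat,
	 * index [0] is anon, [1] is file */
	unsigned long recent_scanned[2] = { 1000, 8000 };
	unsigned long recent_rotated[2] = { 900,  1000 };

	unsigned long long ap = anon_prio * (recent_scanned[0] + 1);
	unsigned long long fp = file_prio * (recent_scanned[1] + 1);
	ap /= recent_rotated[0] + 1;
	fp /= recent_rotated[1] + 1;

	/* denominator = ap + fp + 1 in the surrounding (unchanged) code */
	unsigned long long denominator = ap + fp + 1;

	unsigned long lru_size = 1UL << 18;	/* 256K pages on one LRU */
	int priority = 10;
	unsigned long long scan = lru_size >> priority;

	printf("anon share: %llu of %llu scanned pages\n",
	       scan * ap / denominator, scan);
	printf("file share: %llu of %llu scanned pages\n",
	       scan * fp / denominator, scan);
	return 0;
}

Because the anon list here rotated almost everything it scanned while the file list rotated little, the file list ends up taking most of the scan budget, which is the behaviour the ratios are meant to produce.
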
@@ -2009,14 +1683,25 @@ out:
2009 } 1683 }
2010} 1684}
2011 1685
1686/* Use reclaim/compaction for costly allocs or under memory pressure */
1687static bool in_reclaim_compaction(struct scan_control *sc)
1688{
1689 if (COMPACTION_BUILD && sc->order &&
1690 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1691 sc->priority < DEF_PRIORITY - 2))
1692 return true;
1693
1694 return false;
1695}
1696
2012/* 1697/*
2013 * Reclaim/compaction depends on a number of pages being freed. To avoid 1698 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2014 * disruption to the system, a small number of order-0 pages continue to be 1699 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2015 * rotated and reclaimed in the normal fashion. However, by the time we get 1700 * true if more pages should be reclaimed such that when the page allocator
2016 * back to the allocator and call try_to_compact_zone(), we ensure that 1701 * calls try_to_compact_zone() that it will have enough free pages to succeed.
2017 * there are enough free pages for it to be likely successful 1702 * It will give up earlier than that if there is difficulty reclaiming pages.
2018 */ 1703 */
2019static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, 1704static inline bool should_continue_reclaim(struct lruvec *lruvec,
2020 unsigned long nr_reclaimed, 1705 unsigned long nr_reclaimed,
2021 unsigned long nr_scanned, 1706 unsigned long nr_scanned,
2022 struct scan_control *sc) 1707 struct scan_control *sc)
@@ -2025,7 +1710,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2025 unsigned long inactive_lru_pages; 1710 unsigned long inactive_lru_pages;
2026 1711
2027 /* If not in reclaim/compaction mode, stop */ 1712 /* If not in reclaim/compaction mode, stop */
2028 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1713 if (!in_reclaim_compaction(sc))
2029 return false; 1714 return false;
2030 1715
2031 /* Consider stopping depending on scan and reclaim activity */ 1716 /* Consider stopping depending on scan and reclaim activity */
@@ -2056,15 +1741,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2056 * inactive lists are large enough, continue reclaiming 1741 * inactive lists are large enough, continue reclaiming
2057 */ 1742 */
2058 pages_for_compaction = (2UL << sc->order); 1743 pages_for_compaction = (2UL << sc->order);
2059 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1744 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
2060 if (nr_swap_pages > 0) 1745 if (nr_swap_pages > 0)
2061 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1746 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
2062 if (sc->nr_reclaimed < pages_for_compaction && 1747 if (sc->nr_reclaimed < pages_for_compaction &&
2063 inactive_lru_pages > pages_for_compaction) 1748 inactive_lru_pages > pages_for_compaction)
2064 return true; 1749 return true;
2065 1750
2066 /* If compaction would go ahead or the allocation would succeed, stop */ 1751 /* If compaction would go ahead or the allocation would succeed, stop */
2067 switch (compaction_suitable(mz->zone, sc->order)) { 1752 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
2068 case COMPACT_PARTIAL: 1753 case COMPACT_PARTIAL:
2069 case COMPACT_CONTINUE: 1754 case COMPACT_CONTINUE:
2070 return false; 1755 return false;
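
in_reclaim_compaction() is the open-coded replacement for the old RECLAIM_MODE_COMPACTION bit, and should_continue_reclaim() keeps reclaiming until roughly 2UL << order pages could be freed for compaction. A compact stand-alone rendering of that decision follows; DEF_PRIORITY and PAGE_ALLOC_COSTLY_ORDER are assumed from the kernel headers of this era and COMPACTION_BUILD is assumed enabled:

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY			12	/* assumed kernel values */
#define PAGE_ALLOC_COSTLY_ORDER		3
#define COMPACTION_BUILD		1	/* pretend CONFIG_COMPACTION=y */

/* mirrors the test made by the new in_reclaim_compaction() helper */
static bool in_reclaim_compaction(int order, int priority)
{
	return COMPACTION_BUILD && order &&
	       (order > PAGE_ALLOC_COSTLY_ORDER ||
		priority < DEF_PRIORITY - 2);
}

int main(void)
{
	int order = 2;		/* a non-costly high-order request */

	/* should_continue_reclaim() aims for about 2UL << order free pages */
	printf("pages_for_compaction for order %d: %lu\n", order, 2UL << order);

	for (int priority = DEF_PRIORITY; priority >= 8; priority--)
		printf("order=%d priority=%2d -> %s\n", order, priority,
		       in_reclaim_compaction(order, priority) ?
		       "reclaim/compaction" : "plain reclaim");
	return 0;
}

For a non-costly order the switch to reclaim/compaction only happens once priority has dropped below DEF_PRIORITY - 2, the same cutoff the removed set_reclaim_mode() used.
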
@@ -2076,8 +1761,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2076/* 1761/*
2077 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1762 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2078 */ 1763 */
2079static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, 1764static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2080 struct scan_control *sc)
2081{ 1765{
2082 unsigned long nr[NR_LRU_LISTS]; 1766 unsigned long nr[NR_LRU_LISTS];
2083 unsigned long nr_to_scan; 1767 unsigned long nr_to_scan;
@@ -2089,7 +1773,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2089restart: 1773restart:
2090 nr_reclaimed = 0; 1774 nr_reclaimed = 0;
2091 nr_scanned = sc->nr_scanned; 1775 nr_scanned = sc->nr_scanned;
2092 get_scan_count(mz, sc, nr, priority); 1776 get_scan_count(lruvec, sc, nr);
2093 1777
2094 blk_start_plug(&plug); 1778 blk_start_plug(&plug);
2095 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1779 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2101,7 +1785,7 @@ restart:
2101 nr[lru] -= nr_to_scan; 1785 nr[lru] -= nr_to_scan;
2102 1786
2103 nr_reclaimed += shrink_list(lru, nr_to_scan, 1787 nr_reclaimed += shrink_list(lru, nr_to_scan,
2104 mz, sc, priority); 1788 lruvec, sc);
2105 } 1789 }
2106 } 1790 }
2107 /* 1791 /*
@@ -2112,7 +1796,8 @@ restart:
2112 * with multiple processes reclaiming pages, the total 1796 * with multiple processes reclaiming pages, the total
2113 * freeing target can get unreasonably large. 1797 * freeing target can get unreasonably large.
2114 */ 1798 */
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1799 if (nr_reclaimed >= nr_to_reclaim &&
1800 sc->priority < DEF_PRIORITY)
2116 break; 1801 break;
2117 } 1802 }
2118 blk_finish_plug(&plug); 1803 blk_finish_plug(&plug);
@@ -2122,35 +1807,33 @@ restart:
2122 * Even if we did not try to evict anon pages at all, we want to 1807 * Even if we did not try to evict anon pages at all, we want to
2123 * rebalance the anon lru active/inactive ratio. 1808 * rebalance the anon lru active/inactive ratio.
2124 */ 1809 */
2125 if (inactive_anon_is_low(mz)) 1810 if (inactive_anon_is_low(lruvec))
2126 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); 1811 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1812 sc, LRU_ACTIVE_ANON);
2127 1813
2128 /* reclaim/compaction might need reclaim to continue */ 1814 /* reclaim/compaction might need reclaim to continue */
2129 if (should_continue_reclaim(mz, nr_reclaimed, 1815 if (should_continue_reclaim(lruvec, nr_reclaimed,
2130 sc->nr_scanned - nr_scanned, sc)) 1816 sc->nr_scanned - nr_scanned, sc))
2131 goto restart; 1817 goto restart;
2132 1818
2133 throttle_vm_writeout(sc->gfp_mask); 1819 throttle_vm_writeout(sc->gfp_mask);
2134} 1820}
2135 1821
2136static void shrink_zone(int priority, struct zone *zone, 1822static void shrink_zone(struct zone *zone, struct scan_control *sc)
2137 struct scan_control *sc)
2138{ 1823{
2139 struct mem_cgroup *root = sc->target_mem_cgroup; 1824 struct mem_cgroup *root = sc->target_mem_cgroup;
2140 struct mem_cgroup_reclaim_cookie reclaim = { 1825 struct mem_cgroup_reclaim_cookie reclaim = {
2141 .zone = zone, 1826 .zone = zone,
2142 .priority = priority, 1827 .priority = sc->priority,
2143 }; 1828 };
2144 struct mem_cgroup *memcg; 1829 struct mem_cgroup *memcg;
2145 1830
2146 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1831 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2147 do { 1832 do {
2148 struct mem_cgroup_zone mz = { 1833 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2149 .mem_cgroup = memcg, 1834
2150 .zone = zone, 1835 shrink_lruvec(lruvec, sc);
2151 };
2152 1836
2153 shrink_mem_cgroup_zone(priority, &mz, sc);
2154 /* 1837 /*
2155 * Limit reclaim has historically picked one memcg and 1838 * Limit reclaim has historically picked one memcg and
2156 * scanned it with decreasing priority levels until 1839 * scanned it with decreasing priority levels until
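
shrink_zone() now walks the memcg hierarchy with mem_cgroup_iter() and hands each memcg's lruvec to shrink_lruvec() instead of building a mem_cgroup_zone on the stack. The toy sketch below only reproduces the shape of that do/while walk; memcg_sketch and memcg_iter() are illustrative stand-ins, not the memcg API:

#include <stdio.h>
#include <stddef.h>

/* toy stand-ins for the memcg hierarchy walk in shrink_zone() */
struct memcg_sketch {
	const char *name;
	struct memcg_sketch *next;	/* flattened "hierarchy" for the sketch */
};

static struct memcg_sketch *memcg_iter(struct memcg_sketch *root,
				       struct memcg_sketch *prev)
{
	return prev ? prev->next : root;	/* NULL ends the walk */
}

static void shrink_lruvec_sketch(struct memcg_sketch *memcg)
{
	printf("shrink_lruvec() on the lruvec of %s\n", memcg->name);
}

int main(void)
{
	struct memcg_sketch leaf = { "child-cgroup", NULL };
	struct memcg_sketch root = { "root-cgroup", &leaf };

	/* same shape as the do { ... } while (memcg) loop in shrink_zone() */
	struct memcg_sketch *memcg = memcg_iter(&root, NULL);
	do {
		shrink_lruvec_sketch(memcg);
		memcg = memcg_iter(&root, memcg);
	} while (memcg);
	return 0;
}
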
@@ -2226,8 +1909,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2226 * the caller that it should consider retrying the allocation instead of 1909 * the caller that it should consider retrying the allocation instead of
2227 * further reclaim. 1910 * further reclaim.
2228 */ 1911 */
2229static bool shrink_zones(int priority, struct zonelist *zonelist, 1912static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2230 struct scan_control *sc)
2231{ 1913{
2232 struct zoneref *z; 1914 struct zoneref *z;
2233 struct zone *zone; 1915 struct zone *zone;
@@ -2254,7 +1936,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2254 if (global_reclaim(sc)) { 1936 if (global_reclaim(sc)) {
2255 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1937 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2256 continue; 1938 continue;
2257 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1939 if (zone->all_unreclaimable &&
1940 sc->priority != DEF_PRIORITY)
2258 continue; /* Let kswapd poll it */ 1941 continue; /* Let kswapd poll it */
2259 if (COMPACTION_BUILD) { 1942 if (COMPACTION_BUILD) {
2260 /* 1943 /*
@@ -2286,7 +1969,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2286 /* need some check for avoid more shrink_zone() */ 1969 /* need some check for avoid more shrink_zone() */
2287 } 1970 }
2288 1971
2289 shrink_zone(priority, zone, sc); 1972 shrink_zone(zone, sc);
2290 } 1973 }
2291 1974
2292 return aborted_reclaim; 1975 return aborted_reclaim;
@@ -2337,7 +2020,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 struct scan_control *sc, 2020 struct scan_control *sc,
2338 struct shrink_control *shrink) 2021 struct shrink_control *shrink)
2339{ 2022{
2340 int priority;
2341 unsigned long total_scanned = 0; 2023 unsigned long total_scanned = 0;
2342 struct reclaim_state *reclaim_state = current->reclaim_state; 2024 struct reclaim_state *reclaim_state = current->reclaim_state;
2343 struct zoneref *z; 2025 struct zoneref *z;
@@ -2350,11 +2032,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	if (global_reclaim(sc))
 		count_vm_event(ALLOCSTALL);
 
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	do {
 		sc->nr_scanned = 0;
-		if (!priority)
-			disable_swap_token(sc->target_mem_cgroup);
-		aborted_reclaim = shrink_zones(priority, zonelist, sc);
+		aborted_reclaim = shrink_zones(zonelist, sc);
 
 		/*
 		 * Don't shrink slabs when reclaiming memory from
@@ -2396,7 +2076,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 		/* Take a nap, wait for some writeback to complete */
 		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    priority < DEF_PRIORITY - 2) {
+		    sc->priority < DEF_PRIORITY - 2) {
 			struct zone *preferred_zone;
 
 			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2404,7 +2084,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 						&preferred_zone);
 			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
 		}
-	}
+	} while (--sc->priority >= 0);
 
 out:
 	delayacct_freepages_end();
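With the priority carried inside scan_control, the counted for-loop in do_try_to_free_pages() becomes a do/while that decrements sc->priority in its loop condition, and the swap-token special case at priority 0 disappears. A self-contained sketch of that control flow; fake_shrink_pass() is an invented stand-in for shrink_zones(), not a kernel function:

#include <stdio.h>

#define DEF_PRIORITY 12

struct scan_control {
	unsigned long nr_scanned;
	unsigned long nr_reclaimed;
	unsigned long nr_to_reclaim;
	int priority;
};

/* Stand-in for shrink_zones(): pretend higher pressure (lower priority
 * value) scans and reclaims more per pass. */
static void fake_shrink_pass(struct scan_control *sc)
{
	sc->nr_scanned += 1UL << (DEF_PRIORITY - sc->priority);
	sc->nr_reclaimed += DEF_PRIORITY - sc->priority;
}

int main(void)
{
	struct scan_control sc = {
		.nr_to_reclaim = 32,
		.priority = DEF_PRIORITY,
	};

	do {
		sc.nr_scanned = 0;
		fake_shrink_pass(&sc);
		printf("priority %2d: scanned %lu, reclaimed %lu\n",
		       sc.priority, sc.nr_scanned, sc.nr_reclaimed);
		if (sc.nr_reclaimed >= sc.nr_to_reclaim)
			break;
	} while (--sc.priority >= 0);

	return 0;
}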
@@ -2442,6 +2122,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.may_unmap = 1,
 		.may_swap = 1,
 		.order = order,
+		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = NULL,
 		.nodemask = nodemask,
 	};
@@ -2474,17 +2155,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 		.may_unmap = 1,
 		.may_swap = !noswap,
 		.order = 0,
+		.priority = 0,
 		.target_mem_cgroup = memcg,
 	};
-	struct mem_cgroup_zone mz = {
-		.mem_cgroup = memcg,
-		.zone = zone,
-	};
+	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 
-	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
 						      sc.may_writepage,
 						      sc.gfp_mask);
 
@@ -2495,7 +2174,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 	 * will pick up pages from other mem cgroup's as well. We hack
 	 * the priority and make it zero.
 	 */
-	shrink_mem_cgroup_zone(0, &mz, &sc);
+	shrink_lruvec(lruvec, &sc);
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
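Here the (memcg, zone) pair that used to travel in a mem_cgroup_zone is resolved once into a struct lruvec pointer via mem_cgroup_zone_lruvec() and handed straight to shrink_lruvec(). A minimal userspace sketch of that look-up-then-operate shape; zone_lruvec() and shrink_one_lruvec() are invented stand-ins, not kernel helpers:

#include <stdio.h>

/* Mock stand-ins for zone, mem_cgroup and the per-(memcg, zone) lruvec;
 * the real lookup is mem_cgroup_zone_lruvec() in the kernel. */
struct zone { int id; };
struct mem_cgroup { int id; };
struct lruvec { unsigned long lru_size[5]; };

#define NR_MEMCGS 2
#define NR_ZONES  2

static struct lruvec lruvecs[NR_MEMCGS][NR_ZONES];

static struct lruvec *zone_lruvec(struct zone *zone, struct mem_cgroup *memcg)
{
	return &lruvecs[memcg->id][zone->id];
}

/* Callers now pass the resolved lruvec instead of a (memcg, zone) pair. */
static void shrink_one_lruvec(struct lruvec *lruvec)
{
	for (int lru = 0; lru < 5; lru++)
		if (lruvec->lru_size[lru])
			lruvec->lru_size[lru]--;
}

int main(void)
{
	struct zone zone = { .id = 1 };
	struct mem_cgroup memcg = { .id = 0 };
	struct lruvec *lruvec = zone_lruvec(&zone, &memcg);

	lruvec->lru_size[0] = 4;
	shrink_one_lruvec(lruvec);
	printf("lru 0 size after one pass: %lu\n", lruvec->lru_size[0]);
	return 0;
}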
@@ -2516,6 +2195,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_swap = !noswap,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.order = 0,
+		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = memcg,
 		.nodemask = NULL, /* we don't care the placement */
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2546,8 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 }
 #endif
 
-static void age_active_anon(struct zone *zone, struct scan_control *sc,
-			    int priority)
+static void age_active_anon(struct zone *zone, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
 
@@ -2556,14 +2235,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
-		struct mem_cgroup_zone mz = {
-			.mem_cgroup = memcg,
-			.zone = zone,
-		};
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		if (inactive_anon_is_low(&mz))
-			shrink_active_list(SWAP_CLUSTER_MAX, &mz,
-					   sc, priority, 0);
+		if (inactive_anon_is_low(lruvec))
+			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+					   sc, LRU_ACTIVE_ANON);
 
 		memcg = mem_cgroup_iter(NULL, memcg, NULL);
 	} while (memcg);
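age_active_anon() keeps the usual memcg walk: prime the iterator with NULL, use the result, feed it back in, and stop when NULL comes back. A standalone sketch of that iterator protocol; memcg_iter() is a mock over a flat array, not the kernel's mem_cgroup_iter():

#include <stdio.h>
#include <stddef.h>

/* Mock memcg hierarchy: a flat array stands in for the cgroup tree that
 * mem_cgroup_iter() walks in the kernel. */
struct mem_cgroup { const char *name; };

static struct mem_cgroup groups[] = {
	{ "root" }, { "A" }, { "B" },
};

/* Return the memcg after *prev*, or the first one when prev is NULL,
 * or NULL when the walk is complete (mirrors the iterator protocol). */
static struct mem_cgroup *memcg_iter(struct mem_cgroup *prev)
{
	size_t n = sizeof(groups) / sizeof(groups[0]);

	if (!prev)
		return &groups[0];
	if (prev - groups + 1 < (ptrdiff_t)n)
		return prev + 1;
	return NULL;
}

int main(void)
{
	struct mem_cgroup *memcg;

	memcg = memcg_iter(NULL);
	do {
		printf("aging anon lists of %s\n", memcg->name);
		memcg = memcg_iter(memcg);
	} while (memcg);
	return 0;
}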
@@ -2672,7 +2348,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 {
 	int all_zones_ok;
 	unsigned long balanced;
-	int priority;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 	unsigned long total_scanned;
@@ -2696,18 +2371,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	};
 loop_again:
 	total_scanned = 0;
+	sc.priority = DEF_PRIORITY;
 	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+	do {
 		unsigned long lru_pages = 0;
 		int has_under_min_watermark_zone = 0;
 
-		/* The swap token gets in the way of swapout... */
-		if (!priority)
-			disable_swap_token(NULL);
-
 		all_zones_ok = 1;
 		balanced = 0;
 
@@ -2721,14 +2393,15 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable &&
+			    sc.priority != DEF_PRIORITY)
 				continue;
 
 			/*
 			 * Do some background aging of the anon list, to give
 			 * pages a chance to be referenced before reclaiming.
 			 */
-			age_active_anon(zone, &sc, priority);
+			age_active_anon(zone, &sc);
 
 			/*
 			 * If the number of buffer_heads in the machine
@@ -2776,7 +2449,8 @@ loop_again:
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+			if (zone->all_unreclaimable &&
+			    sc.priority != DEF_PRIORITY)
 				continue;
 
 			sc.nr_scanned = 0;
@@ -2820,7 +2494,7 @@ loop_again:
 			    !zone_watermark_ok_safe(zone, testorder,
 					high_wmark_pages(zone) + balance_gap,
 					end_zone, 0)) {
-				shrink_zone(priority, zone, &sc);
+				shrink_zone(zone, &sc);
 
 				reclaim_state->reclaimed_slab = 0;
 				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2877,7 +2551,7 @@ loop_again:
 		 * OK, kswapd is getting into trouble.  Take a nap, then take
 		 * another pass across the zones.
 		 */
-		if (total_scanned && (priority < DEF_PRIORITY - 2)) {
+		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
 			if (has_under_min_watermark_zone)
 				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
 			else
@@ -2892,7 +2566,7 @@ loop_again:
 		 */
 		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
-	}
+	} while (--sc.priority >= 0);
 out:
 
 	/*
@@ -2942,7 +2616,8 @@ out:
 		if (!populated_zone(zone))
 			continue;
 
-		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+		if (zone->all_unreclaimable &&
+		    sc.priority != DEF_PRIORITY)
 			continue;
 
 		/* Would compaction fail due to lack of free memory? */
@@ -3013,7 +2688,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 * them before going back to sleep.
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-		schedule();
+
+		if (!kthread_should_stop())
+			schedule();
+
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
 		if (remaining)
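The added kthread_should_stop() test makes kswapd re-check the stop request immediately before it sleeps, instead of scheduling away unconditionally. A loose userspace analogue of that ordering, using a pthread worker and an atomic flag in place of the kthread API (compile with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

/* Userspace analogue of the kswapd change: re-check the stop request
 * right before going to sleep, so a stop issued earlier is not slept
 * through. kthread_should_stop()/schedule() are kernel APIs; this is
 * only a sketch of the ordering. */
static atomic_bool should_stop;

static void *worker(void *arg)
{
	(void)arg;
	while (!atomic_load(&should_stop)) {
		/* ... do one pass of work ... */
		if (!atomic_load(&should_stop))	/* the added check */
			usleep(1000);		/* stand-in for schedule() */
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	usleep(10000);
	atomic_store(&should_stop, true);
	pthread_join(tid, NULL);
	puts("worker stopped");
	return 0;
}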
@@ -3209,6 +2887,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.nr_to_reclaim = nr_to_reclaim,
 		.hibernation_mode = 1,
 		.order = 0,
+		.priority = DEF_PRIORITY,
 	};
 	struct shrink_control shrink = {
 		.gfp_mask = sc.gfp_mask,
@@ -3279,14 +2958,17 @@ int kswapd_run(int nid)
 }
 
 /*
- * Called by memory hotplug when all memory in a node is offlined.
+ * Called by memory hotplug when all memory in a node is offlined. Caller must
+ * hold lock_memory_hotplug().
  */
 void kswapd_stop(int nid)
 {
 	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
 
-	if (kswapd)
+	if (kswapd) {
 		kthread_stop(kswapd);
+		NODE_DATA(nid)->kswapd = NULL;
+	}
 }
 
 static int __init kswapd_init(void)
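kswapd_stop() now clears the per-node kswapd pointer after stopping the thread, so a later kswapd_run() for a re-onlined node starts a fresh thread instead of finding a stale task pointer. A minimal sketch of that stop-and-clear bookkeeping; worker_run()/worker_stop() and the node_worker[] table are invented for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the stop-and-clear pattern: after stopping a per-node worker,
 * clear the cached pointer so a later "run" starts a fresh one rather
 * than seeing a stale, already-stopped task. Mock types, not kernel ones. */
struct worker { int nid; };

#define MAX_NODES 4
static struct worker *node_worker[MAX_NODES];

static void worker_run(int nid)
{
	if (node_worker[nid])		/* already running */
		return;
	node_worker[nid] = malloc(sizeof(*node_worker[nid]));
	node_worker[nid]->nid = nid;
	printf("started worker for node %d\n", nid);
}

static void worker_stop(int nid)
{
	struct worker *w = node_worker[nid];

	if (w) {
		free(w);			/* stand-in for kthread_stop() */
		node_worker[nid] = NULL;	/* the added clearing step */
	}
}

int main(void)
{
	worker_run(1);
	worker_stop(1);
	worker_run(1);	/* restarts cleanly because the slot was cleared */
	worker_stop(1);
	return 0;
}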
@@ -3386,7 +3068,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	int priority;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3395,6 +3076,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 		.order = order,
+		.priority = ZONE_RECLAIM_PRIORITY,
 	};
 	struct shrink_control shrink = {
 		.gfp_mask = sc.gfp_mask,
@@ -3417,11 +3099,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
 		 */
-		priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			shrink_zone(priority, zone, &sc);
-			priority--;
-		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+			shrink_zone(zone, &sc);
+		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
 	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3536,7 +3216,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
 	if (mapping_unevictable(page_mapping(page)))
 		return 0;
 
-	if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
+	if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
 		return 0;
 
 	return 1;
@@ -3572,6 +3252,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
+		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
@@ -3581,11 +3262,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 
 			VM_BUG_ON(PageActive(page));
 			ClearPageUnevictable(page);
-			__dec_zone_state(zone, NR_UNEVICTABLE);
-			lruvec = mem_cgroup_lru_move_lists(zone, page,
-						LRU_UNEVICTABLE, lru);
-			list_move(&page->lru, &lruvec->lists[lru]);
-			__inc_zone_state(zone, NR_INACTIVE_ANON + lru);
+			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
+			add_page_to_lru_list(page, lruvec, lru);
 			pgrescued++;
 		}
 	}
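The open-coded counter updates and list_move() are replaced by the paired del_page_from_lru_list()/add_page_to_lru_list() helpers, which keep the lruvec's size accounting next to the list manipulation. A self-contained sketch of that pairing with a toy circular list and per-list counters; the page/lruvec types here are mock stand-ins, not the kernel's:

#include <stdio.h>

/* Each helper updates both the list and its size counter, so callers
 * never touch the statistics directly. */
enum lru { LRU_INACTIVE, LRU_UNEVICTABLE, NR_LRU };

struct page { struct page *prev, *next; enum lru lru; };

struct lruvec {
	struct page heads[NR_LRU];	/* circular list heads */
	unsigned long size[NR_LRU];
};

static void lruvec_init(struct lruvec *v)
{
	for (int i = 0; i < NR_LRU; i++) {
		v->heads[i].prev = v->heads[i].next = &v->heads[i];
		v->size[i] = 0;
	}
}

static void del_page(struct page *p, struct lruvec *v)
{
	p->prev->next = p->next;
	p->next->prev = p->prev;
	v->size[p->lru]--;
}

static void add_page(struct page *p, struct lruvec *v, enum lru lru)
{
	struct page *head = &v->heads[lru];

	p->next = head->next;
	p->prev = head;
	head->next->prev = p;
	head->next = p;
	p->lru = lru;
	v->size[lru]++;
}

int main(void)
{
	struct lruvec v;
	struct page page;

	lruvec_init(&v);
	add_page(&page, &v, LRU_UNEVICTABLE);
	/* "rescue": move the page off the unevictable list */
	del_page(&page, &v);
	add_page(&page, &v, LRU_INACTIVE);
	printf("unevictable=%lu inactive=%lu\n",
	       v.size[LRU_UNEVICTABLE], v.size[LRU_INACTIVE]);
	return 0;
}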