Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--   mm/vmscan.c   462
1 file changed, 184 insertions(+), 278 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8deb5f4da4d9..347b3ff2a478 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -78,6 +78,9 @@ struct scan_control {
78 78
79 int order; 79 int order;
80 80
81 /* Scan (total_size >> priority) pages at once */
82 int priority;
83
81 /* 84 /*
82 * The memory cgroup that hit its limit and as a result is the 85 * The memory cgroup that hit its limit and as a result is the
83 * primary target of this reclaim invocation. 86 * primary target of this reclaim invocation.
@@ -91,11 +94,6 @@ struct scan_control {
91 nodemask_t *nodemask; 94 nodemask_t *nodemask;
92}; 95};
93 96
94struct mem_cgroup_zone {
95 struct mem_cgroup *mem_cgroup;
96 struct zone *zone;
97};
98
99#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 97#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
100 98
101#ifdef ARCH_HAS_PREFETCH 99#ifdef ARCH_HAS_PREFETCH
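The two hunks above carry the theme of the whole patch: the reclaim priority becomes a field of struct scan_control instead of a parameter threaded through every call chain, and the ad-hoc struct mem_cgroup_zone pairing of a memcg with a zone is dropped in favour of passing a struct lruvec directly. A minimal before/after sketch, limited to the fields the rest of this diff actually dereferences (the full lruvec definition lives in the mm headers and is not part of this hunk):

        /* Before: callers bundled the (memcg, zone) pair by hand. */
        struct mem_cgroup_zone {
                struct mem_cgroup *mem_cgroup;
                struct zone *zone;
        };

        /*
         * After: the per-memcg, per-zone LRU state is handed around as one
         * object.  Assumed shape, showing only what this diff touches:
         * lruvec->lists[lru] for the page lists and lruvec->reclaim_stat for
         * the recent_scanned/recent_rotated ratios; the owning zone is
         * recovered with lruvec_zone(lruvec) instead of carrying mz->zone.
         */
        struct lruvec {
                struct list_head         lists[NR_LRU_LISTS];
                struct zone_reclaim_stat reclaim_stat;
        };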
@@ -147,24 +145,14 @@ static bool global_reclaim(struct scan_control *sc)
147} 145}
148#endif 146#endif
149 147
150static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) 148static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
151{
152 return &mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup)->reclaim_stat;
153}
154
155static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
156 enum lru_list lru)
157{ 149{
158 if (!mem_cgroup_disabled()) 150 if (!mem_cgroup_disabled())
159 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, 151 return mem_cgroup_get_lru_size(lruvec, lru);
160 zone_to_nid(mz->zone),
161 zone_idx(mz->zone),
162 BIT(lru));
163 152
164 return zone_page_state(mz->zone, NR_LRU_BASE + lru); 153 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
165} 154}
166 155
167
168/* 156/*
169 * Add a shrinker callback to be called from the vm 157 * Add a shrinker callback to be called from the vm
170 */ 158 */
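Reassembled from the hunk above, the replacement helper that folds zone_nr_lru_pages() and its memcg special case into one lruvec-based accessor (get_reclaim_stat() disappears because reclaim_stat is now reached directly as &lruvec->reclaim_stat):

        static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
        {
                /* With memcg enabled, the per-lruvec counter is authoritative. */
                if (!mem_cgroup_disabled())
                        return mem_cgroup_get_lru_size(lruvec, lru);

                /* Otherwise fall back to the zone-wide vmstat counter for this list. */
                return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
        }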
@@ -626,7 +614,6 @@ enum page_references {
626}; 614};
627 615
628static enum page_references page_check_references(struct page *page, 616static enum page_references page_check_references(struct page *page,
629 struct mem_cgroup_zone *mz,
630 struct scan_control *sc) 617 struct scan_control *sc)
631{ 618{
632 int referenced_ptes, referenced_page; 619 int referenced_ptes, referenced_page;
@@ -685,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
685 * shrink_page_list() returns the number of reclaimed pages 672 * shrink_page_list() returns the number of reclaimed pages
686 */ 673 */
687static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
688 struct mem_cgroup_zone *mz, 675 struct zone *zone,
689 struct scan_control *sc, 676 struct scan_control *sc,
690 int priority,
691 unsigned long *ret_nr_dirty, 677 unsigned long *ret_nr_dirty,
692 unsigned long *ret_nr_writeback) 678 unsigned long *ret_nr_writeback)
693{ 679{
@@ -716,7 +702,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
716 goto keep; 702 goto keep;
717 703
718 VM_BUG_ON(PageActive(page)); 704 VM_BUG_ON(PageActive(page));
719 VM_BUG_ON(page_zone(page) != mz->zone); 705 VM_BUG_ON(page_zone(page) != zone);
720 706
721 sc->nr_scanned++; 707 sc->nr_scanned++;
722 708
@@ -739,7 +725,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
739 goto keep; 725 goto keep;
740 } 726 }
741 727
742 references = page_check_references(page, mz, sc); 728 references = page_check_references(page, sc);
743 switch (references) { 729 switch (references) {
744 case PAGEREF_ACTIVATE: 730 case PAGEREF_ACTIVATE:
745 goto activate_locked; 731 goto activate_locked;
@@ -790,7 +776,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
790 * unless under significant pressure. 776 * unless under significant pressure.
791 */ 777 */
792 if (page_is_file_cache(page) && 778 if (page_is_file_cache(page) &&
793 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { 779 (!current_is_kswapd() ||
780 sc->priority >= DEF_PRIORITY - 2)) {
794 /* 781 /*
795 * Immediately reclaim when written back. 782 * Immediately reclaim when written back.
796 * Similar in principal to deactivate_page() 783 * Similar in principal to deactivate_page()
@@ -928,7 +915,7 @@ keep:
928 * will encounter the same problem 915 * will encounter the same problem
929 */ 916 */
930 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) 917 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
931 zone_set_flag(mz->zone, ZONE_CONGESTED); 918 zone_set_flag(zone, ZONE_CONGESTED);
932 919
933 free_hot_cold_page_list(&free_pages, 1); 920 free_hot_cold_page_list(&free_pages, 1);
934 921
@@ -949,29 +936,14 @@ keep:
949 * 936 *
950 * returns 0 on success, -ve errno on failure. 937 * returns 0 on success, -ve errno on failure.
951 */ 938 */
952int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) 939int __isolate_lru_page(struct page *page, isolate_mode_t mode)
953{ 940{
954 bool all_lru_mode;
955 int ret = -EINVAL; 941 int ret = -EINVAL;
956 942
957 /* Only take pages on the LRU. */ 943 /* Only take pages on the LRU. */
958 if (!PageLRU(page)) 944 if (!PageLRU(page))
959 return ret; 945 return ret;
960 946
961 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
962 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
963
964 /*
965 * When checking the active state, we need to be sure we are
966 * dealing with comparible boolean values. Take the logical not
967 * of each.
968 */
969 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
970 return ret;
971
972 if (!all_lru_mode && !!page_is_file_cache(page) != file)
973 return ret;
974
975 /* Do not give back unevictable pages for compaction */ 947 /* Do not give back unevictable pages for compaction */
976 if (PageUnevictable(page)) 948 if (PageUnevictable(page))
977 return ret; 949 return ret;
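Because every caller now isolates from one specific lruvec->lists[lru], the ISOLATE_ACTIVE/ISOLATE_INACTIVE mode bits and the file argument become redundant: the active/file filtering the deleted block performed is implied by which list is being walked. A condensed sketch of how the simplified function reads after this hunk, with the unchanged remainder elided:

        int __isolate_lru_page(struct page *page, isolate_mode_t mode)
        {
                int ret = -EINVAL;

                /* Only take pages on the LRU. */
                if (!PageLRU(page))
                        return ret;

                /* Do not give back unevictable pages for compaction. */
                if (PageUnevictable(page))
                        return ret;

                /*
                 * ... the remaining mode checks (dirty/writeback and unmapped
                 * handling for compaction) and the page claim are untouched
                 * by this patch ...
                 */
                return ret;
        }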
@@ -1039,47 +1011,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1039 * Appropriate locks must be held before calling this function. 1011 * Appropriate locks must be held before calling this function.
1040 * 1012 *
1041 * @nr_to_scan: The number of pages to look through on the list. 1013 * @nr_to_scan: The number of pages to look through on the list.
1042 * @mz: The mem_cgroup_zone to pull pages from. 1014 * @lruvec: The LRU vector to pull pages from.
1043 * @dst: The temp list to put pages on to. 1015 * @dst: The temp list to put pages on to.
1044 * @nr_scanned: The number of pages that were scanned. 1016 * @nr_scanned: The number of pages that were scanned.
1045 * @sc: The scan_control struct for this reclaim session 1017 * @sc: The scan_control struct for this reclaim session
1046 * @mode: One of the LRU isolation modes 1018 * @mode: One of the LRU isolation modes
1047 * @active: True [1] if isolating active pages 1019 * @lru: LRU list id for isolating
1048 * @file: True [1] if isolating file [!anon] pages
1049 * 1020 *
1050 * returns how many pages were moved onto *@dst. 1021 * returns how many pages were moved onto *@dst.
1051 */ 1022 */
1052static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1023static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1053 struct mem_cgroup_zone *mz, struct list_head *dst, 1024 struct lruvec *lruvec, struct list_head *dst,
1054 unsigned long *nr_scanned, struct scan_control *sc, 1025 unsigned long *nr_scanned, struct scan_control *sc,
1055 isolate_mode_t mode, int active, int file) 1026 isolate_mode_t mode, enum lru_list lru)
1056{ 1027{
1057 struct lruvec *lruvec; 1028 struct list_head *src = &lruvec->lists[lru];
1058 struct list_head *src;
1059 unsigned long nr_taken = 0; 1029 unsigned long nr_taken = 0;
1060 unsigned long scan; 1030 unsigned long scan;
1061 int lru = LRU_BASE;
1062
1063 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1064 if (active)
1065 lru += LRU_ACTIVE;
1066 if (file)
1067 lru += LRU_FILE;
1068 src = &lruvec->lists[lru];
1069 1031
1070 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1032 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1071 struct page *page; 1033 struct page *page;
1034 int nr_pages;
1072 1035
1073 page = lru_to_page(src); 1036 page = lru_to_page(src);
1074 prefetchw_prev_lru_page(page, src, flags); 1037 prefetchw_prev_lru_page(page, src, flags);
1075 1038
1076 VM_BUG_ON(!PageLRU(page)); 1039 VM_BUG_ON(!PageLRU(page));
1077 1040
1078 switch (__isolate_lru_page(page, mode, file)) { 1041 switch (__isolate_lru_page(page, mode)) {
1079 case 0: 1042 case 0:
1080 mem_cgroup_lru_del(page); 1043 nr_pages = hpage_nr_pages(page);
1044 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1081 list_move(&page->lru, dst); 1045 list_move(&page->lru, dst);
1082 nr_taken += hpage_nr_pages(page); 1046 nr_taken += nr_pages;
1083 break; 1047 break;
1084 1048
1085 case -EBUSY: 1049 case -EBUSY:
@@ -1093,11 +1057,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1093 } 1057 }
1094 1058
1095 *nr_scanned = scan; 1059 *nr_scanned = scan;
1096 1060 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1097 trace_mm_vmscan_lru_isolate(sc->order, 1061 nr_taken, mode, is_file_lru(lru));
1098 nr_to_scan, scan,
1099 nr_taken,
1100 mode, file);
1101 return nr_taken; 1062 return nr_taken;
1102} 1063}
1103 1064
@@ -1134,15 +1095,16 @@ int isolate_lru_page(struct page *page)
1134 1095
1135 if (PageLRU(page)) { 1096 if (PageLRU(page)) {
1136 struct zone *zone = page_zone(page); 1097 struct zone *zone = page_zone(page);
1098 struct lruvec *lruvec;
1137 1099
1138 spin_lock_irq(&zone->lru_lock); 1100 spin_lock_irq(&zone->lru_lock);
1101 lruvec = mem_cgroup_page_lruvec(page, zone);
1139 if (PageLRU(page)) { 1102 if (PageLRU(page)) {
1140 int lru = page_lru(page); 1103 int lru = page_lru(page);
1141 ret = 0;
1142 get_page(page); 1104 get_page(page);
1143 ClearPageLRU(page); 1105 ClearPageLRU(page);
1144 1106 del_page_from_lru_list(page, lruvec, lru);
1145 del_page_from_lru_list(zone, page, lru); 1107 ret = 0;
1146 } 1108 }
1147 spin_unlock_irq(&zone->lru_lock); 1109 spin_unlock_irq(&zone->lru_lock);
1148 } 1110 }
@@ -1175,11 +1137,10 @@ static int too_many_isolated(struct zone *zone, int file,
1175} 1137}
1176 1138
1177static noinline_for_stack void 1139static noinline_for_stack void
1178putback_inactive_pages(struct mem_cgroup_zone *mz, 1140putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1179 struct list_head *page_list)
1180{ 1141{
1181 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1142 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1182 struct zone *zone = mz->zone; 1143 struct zone *zone = lruvec_zone(lruvec);
1183 LIST_HEAD(pages_to_free); 1144 LIST_HEAD(pages_to_free);
1184 1145
1185 /* 1146 /*
@@ -1197,9 +1158,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1197 spin_lock_irq(&zone->lru_lock); 1158 spin_lock_irq(&zone->lru_lock);
1198 continue; 1159 continue;
1199 } 1160 }
1161
1162 lruvec = mem_cgroup_page_lruvec(page, zone);
1163
1200 SetPageLRU(page); 1164 SetPageLRU(page);
1201 lru = page_lru(page); 1165 lru = page_lru(page);
1202 add_page_to_lru_list(zone, page, lru); 1166 add_page_to_lru_list(page, lruvec, lru);
1167
1203 if (is_active_lru(lru)) { 1168 if (is_active_lru(lru)) {
1204 int file = is_file_lru(lru); 1169 int file = is_file_lru(lru);
1205 int numpages = hpage_nr_pages(page); 1170 int numpages = hpage_nr_pages(page);
@@ -1208,7 +1173,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1208 if (put_page_testzero(page)) { 1173 if (put_page_testzero(page)) {
1209 __ClearPageLRU(page); 1174 __ClearPageLRU(page);
1210 __ClearPageActive(page); 1175 __ClearPageActive(page);
1211 del_page_from_lru_list(zone, page, lru); 1176 del_page_from_lru_list(page, lruvec, lru);
1212 1177
1213 if (unlikely(PageCompound(page))) { 1178 if (unlikely(PageCompound(page))) {
1214 spin_unlock_irq(&zone->lru_lock); 1179 spin_unlock_irq(&zone->lru_lock);
@@ -1225,71 +1190,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1225 list_splice(&pages_to_free, page_list); 1190 list_splice(&pages_to_free, page_list);
1226} 1191}
1227 1192
1228static noinline_for_stack void
1229update_isolated_counts(struct mem_cgroup_zone *mz,
1230 struct list_head *page_list,
1231 unsigned long *nr_anon,
1232 unsigned long *nr_file)
1233{
1234 struct zone *zone = mz->zone;
1235 unsigned int count[NR_LRU_LISTS] = { 0, };
1236 unsigned long nr_active = 0;
1237 struct page *page;
1238 int lru;
1239
1240 /*
1241 * Count pages and clear active flags
1242 */
1243 list_for_each_entry(page, page_list, lru) {
1244 int numpages = hpage_nr_pages(page);
1245 lru = page_lru_base_type(page);
1246 if (PageActive(page)) {
1247 lru += LRU_ACTIVE;
1248 ClearPageActive(page);
1249 nr_active += numpages;
1250 }
1251 count[lru] += numpages;
1252 }
1253
1254 preempt_disable();
1255 __count_vm_events(PGDEACTIVATE, nr_active);
1256
1257 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1258 -count[LRU_ACTIVE_FILE]);
1259 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1260 -count[LRU_INACTIVE_FILE]);
1261 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1262 -count[LRU_ACTIVE_ANON]);
1263 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1264 -count[LRU_INACTIVE_ANON]);
1265
1266 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1267 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1268
1269 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1270 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1271 preempt_enable();
1272}
1273
1274/* 1193/*
1275 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1194 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1276 * of reclaimed pages 1195 * of reclaimed pages
1277 */ 1196 */
1278static noinline_for_stack unsigned long 1197static noinline_for_stack unsigned long
1279shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, 1198shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1280 struct scan_control *sc, int priority, int file) 1199 struct scan_control *sc, enum lru_list lru)
1281{ 1200{
1282 LIST_HEAD(page_list); 1201 LIST_HEAD(page_list);
1283 unsigned long nr_scanned; 1202 unsigned long nr_scanned;
1284 unsigned long nr_reclaimed = 0; 1203 unsigned long nr_reclaimed = 0;
1285 unsigned long nr_taken; 1204 unsigned long nr_taken;
1286 unsigned long nr_anon;
1287 unsigned long nr_file;
1288 unsigned long nr_dirty = 0; 1205 unsigned long nr_dirty = 0;
1289 unsigned long nr_writeback = 0; 1206 unsigned long nr_writeback = 0;
1290 isolate_mode_t isolate_mode = ISOLATE_INACTIVE; 1207 isolate_mode_t isolate_mode = 0;
1291 struct zone *zone = mz->zone; 1208 int file = is_file_lru(lru);
1292 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1209 struct zone *zone = lruvec_zone(lruvec);
1210 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1293 1211
1294 while (unlikely(too_many_isolated(zone, file, sc))) { 1212 while (unlikely(too_many_isolated(zone, file, sc))) {
1295 congestion_wait(BLK_RW_ASYNC, HZ/10); 1213 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1308,31 +1226,30 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1308 1226
1309 spin_lock_irq(&zone->lru_lock); 1227 spin_lock_irq(&zone->lru_lock);
1310 1228
1311 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, 1229 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1312 sc, isolate_mode, 0, file); 1230 &nr_scanned, sc, isolate_mode, lru);
1231
1232 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1233 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1234
1313 if (global_reclaim(sc)) { 1235 if (global_reclaim(sc)) {
1314 zone->pages_scanned += nr_scanned; 1236 zone->pages_scanned += nr_scanned;
1315 if (current_is_kswapd()) 1237 if (current_is_kswapd())
1316 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1238 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1317 nr_scanned);
1318 else 1239 else
1319 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1240 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1320 nr_scanned);
1321 } 1241 }
1322 spin_unlock_irq(&zone->lru_lock); 1242 spin_unlock_irq(&zone->lru_lock);
1323 1243
1324 if (nr_taken == 0) 1244 if (nr_taken == 0)
1325 return 0; 1245 return 0;
1326 1246
1327 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1247 nr_reclaimed = shrink_page_list(&page_list, zone, sc,
1328
1329 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1330 &nr_dirty, &nr_writeback); 1248 &nr_dirty, &nr_writeback);
1331 1249
1332 spin_lock_irq(&zone->lru_lock); 1250 spin_lock_irq(&zone->lru_lock);
1333 1251
1334 reclaim_stat->recent_scanned[0] += nr_anon; 1252 reclaim_stat->recent_scanned[file] += nr_taken;
1335 reclaim_stat->recent_scanned[1] += nr_file;
1336 1253
1337 if (global_reclaim(sc)) { 1254 if (global_reclaim(sc)) {
1338 if (current_is_kswapd()) 1255 if (current_is_kswapd())
@@ -1343,10 +1260,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1343 nr_reclaimed); 1260 nr_reclaimed);
1344 } 1261 }
1345 1262
1346 putback_inactive_pages(mz, &page_list); 1263 putback_inactive_pages(lruvec, &page_list);
1347 1264
1348 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); 1265 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1349 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1350 1266
1351 spin_unlock_irq(&zone->lru_lock); 1267 spin_unlock_irq(&zone->lru_lock);
1352 1268
@@ -1375,13 +1291,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1375 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any 1291 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1376 * isolated page is PageWriteback 1292 * isolated page is PageWriteback
1377 */ 1293 */
1378 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) 1294 if (nr_writeback && nr_writeback >=
1295 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1379 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1296 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1380 1297
1381 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1298 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1382 zone_idx(zone), 1299 zone_idx(zone),
1383 nr_scanned, nr_reclaimed, 1300 nr_scanned, nr_reclaimed,
1384 priority, 1301 sc->priority,
1385 trace_shrink_flags(file)); 1302 trace_shrink_flags(file));
1386 return nr_reclaimed; 1303 return nr_reclaimed;
1387} 1304}
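With isolation done per LRU list, the removed update_isolated_counts() loop over all four lists is replaced by two direct vmstat adjustments keyed on the single list being reclaimed; this works because the zone counters are laid out so that NR_LRU_BASE + lru and NR_ISOLATED_ANON + file (file being 0 or 1) select the matching item. The accounting around shrink_inactive_list() now pairs up as follows, with the lines taken from the hunks above:

        /* after isolating nr_taken pages from lruvec->lists[lru]: */
        __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);

        /* one reclaim_stat counter instead of separate anon and file sums: */
        reclaim_stat->recent_scanned[file] += nr_taken;

        /* after putback_inactive_pages(): */
        __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);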
@@ -1404,30 +1321,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1404 * But we had to alter page->flags anyway. 1321 * But we had to alter page->flags anyway.
1405 */ 1322 */
1406 1323
1407static void move_active_pages_to_lru(struct zone *zone, 1324static void move_active_pages_to_lru(struct lruvec *lruvec,
1408 struct list_head *list, 1325 struct list_head *list,
1409 struct list_head *pages_to_free, 1326 struct list_head *pages_to_free,
1410 enum lru_list lru) 1327 enum lru_list lru)
1411{ 1328{
1329 struct zone *zone = lruvec_zone(lruvec);
1412 unsigned long pgmoved = 0; 1330 unsigned long pgmoved = 0;
1413 struct page *page; 1331 struct page *page;
1332 int nr_pages;
1414 1333
1415 while (!list_empty(list)) { 1334 while (!list_empty(list)) {
1416 struct lruvec *lruvec;
1417
1418 page = lru_to_page(list); 1335 page = lru_to_page(list);
1336 lruvec = mem_cgroup_page_lruvec(page, zone);
1419 1337
1420 VM_BUG_ON(PageLRU(page)); 1338 VM_BUG_ON(PageLRU(page));
1421 SetPageLRU(page); 1339 SetPageLRU(page);
1422 1340
1423 lruvec = mem_cgroup_lru_add_list(zone, page, lru); 1341 nr_pages = hpage_nr_pages(page);
1342 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1424 list_move(&page->lru, &lruvec->lists[lru]); 1343 list_move(&page->lru, &lruvec->lists[lru]);
1425 pgmoved += hpage_nr_pages(page); 1344 pgmoved += nr_pages;
1426 1345
1427 if (put_page_testzero(page)) { 1346 if (put_page_testzero(page)) {
1428 __ClearPageLRU(page); 1347 __ClearPageLRU(page);
1429 __ClearPageActive(page); 1348 __ClearPageActive(page);
1430 del_page_from_lru_list(zone, page, lru); 1349 del_page_from_lru_list(page, lruvec, lru);
1431 1350
1432 if (unlikely(PageCompound(page))) { 1351 if (unlikely(PageCompound(page))) {
1433 spin_unlock_irq(&zone->lru_lock); 1352 spin_unlock_irq(&zone->lru_lock);
@@ -1443,9 +1362,9 @@ static void move_active_pages_to_lru(struct zone *zone,
1443} 1362}
1444 1363
1445static void shrink_active_list(unsigned long nr_to_scan, 1364static void shrink_active_list(unsigned long nr_to_scan,
1446 struct mem_cgroup_zone *mz, 1365 struct lruvec *lruvec,
1447 struct scan_control *sc, 1366 struct scan_control *sc,
1448 int priority, int file) 1367 enum lru_list lru)
1449{ 1368{
1450 unsigned long nr_taken; 1369 unsigned long nr_taken;
1451 unsigned long nr_scanned; 1370 unsigned long nr_scanned;
@@ -1454,10 +1373,11 @@ static void shrink_active_list(unsigned long nr_to_scan,
1454 LIST_HEAD(l_active); 1373 LIST_HEAD(l_active);
1455 LIST_HEAD(l_inactive); 1374 LIST_HEAD(l_inactive);
1456 struct page *page; 1375 struct page *page;
1457 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1376 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1458 unsigned long nr_rotated = 0; 1377 unsigned long nr_rotated = 0;
1459 isolate_mode_t isolate_mode = ISOLATE_ACTIVE; 1378 isolate_mode_t isolate_mode = 0;
1460 struct zone *zone = mz->zone; 1379 int file = is_file_lru(lru);
1380 struct zone *zone = lruvec_zone(lruvec);
1461 1381
1462 lru_add_drain(); 1382 lru_add_drain();
1463 1383
@@ -1468,18 +1388,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
1468 1388
1469 spin_lock_irq(&zone->lru_lock); 1389 spin_lock_irq(&zone->lru_lock);
1470 1390
1471 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, 1391 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1472 isolate_mode, 1, file); 1392 &nr_scanned, sc, isolate_mode, lru);
1473 if (global_reclaim(sc)) 1393 if (global_reclaim(sc))
1474 zone->pages_scanned += nr_scanned; 1394 zone->pages_scanned += nr_scanned;
1475 1395
1476 reclaim_stat->recent_scanned[file] += nr_taken; 1396 reclaim_stat->recent_scanned[file] += nr_taken;
1477 1397
1478 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1398 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1479 if (file) 1399 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1480 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1481 else
1482 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1483 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1400 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1484 spin_unlock_irq(&zone->lru_lock); 1401 spin_unlock_irq(&zone->lru_lock);
1485 1402
@@ -1535,10 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1535 */ 1452 */
1536 reclaim_stat->recent_rotated[file] += nr_rotated; 1453 reclaim_stat->recent_rotated[file] += nr_rotated;
1537 1454
1538 move_active_pages_to_lru(zone, &l_active, &l_hold, 1455 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1539 LRU_ACTIVE + file * LRU_FILE); 1456 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1540 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1541 LRU_BASE + file * LRU_FILE);
1542 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1457 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1543 spin_unlock_irq(&zone->lru_lock); 1458 spin_unlock_irq(&zone->lru_lock);
1544 1459
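The old LRU_ACTIVE + file * LRU_FILE arithmetic disappears from shrink_active_list() because the caller already passes the active list's index; demotion to the inactive counterpart is then just an offset. This relies on the enum lru_list layout (inactive and active entries adjacent per type, LRU_ACTIVE == 1), which the two move_active_pages_to_lru() calls above exploit:

        /*
         * enum lru_list of this era, for reference:
         *   LRU_INACTIVE_ANON = 0, LRU_ACTIVE_ANON = 1,
         *   LRU_INACTIVE_FILE = 2, LRU_ACTIVE_FILE = 3, LRU_UNEVICTABLE = 4
         * so for an active list index, lru - LRU_ACTIVE is its inactive twin.
         */
        move_active_pages_to_lru(lruvec, &l_active,   &l_hold, lru);              /* keep active */
        move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); /* deactivate  */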
@@ -1561,13 +1476,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
1561 1476
1562/** 1477/**
1563 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1478 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1564 * @zone: zone to check 1479 * @lruvec: LRU vector to check
1565 * @sc: scan control of this context
1566 * 1480 *
1567 * Returns true if the zone does not have enough inactive anon pages, 1481 * Returns true if the zone does not have enough inactive anon pages,
1568 * meaning some active anon pages need to be deactivated. 1482 * meaning some active anon pages need to be deactivated.
1569 */ 1483 */
1570static int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1484static int inactive_anon_is_low(struct lruvec *lruvec)
1571{ 1485{
1572 /* 1486 /*
1573 * If we don't have swap space, anonymous page deactivation 1487 * If we don't have swap space, anonymous page deactivation
@@ -1577,13 +1491,12 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1577 return 0; 1491 return 0;
1578 1492
1579 if (!mem_cgroup_disabled()) 1493 if (!mem_cgroup_disabled())
1580 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, 1494 return mem_cgroup_inactive_anon_is_low(lruvec);
1581 mz->zone);
1582 1495
1583 return inactive_anon_is_low_global(mz->zone); 1496 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1584} 1497}
1585#else 1498#else
1586static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1499static inline int inactive_anon_is_low(struct lruvec *lruvec)
1587{ 1500{
1588 return 0; 1501 return 0;
1589} 1502}
@@ -1601,7 +1514,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1601 1514
1602/** 1515/**
1603 * inactive_file_is_low - check if file pages need to be deactivated 1516 * inactive_file_is_low - check if file pages need to be deactivated
1604 * @mz: memory cgroup and zone to check 1517 * @lruvec: LRU vector to check
1605 * 1518 *
1606 * When the system is doing streaming IO, memory pressure here 1519 * When the system is doing streaming IO, memory pressure here
1607 * ensures that active file pages get deactivated, until more 1520 * ensures that active file pages get deactivated, until more
@@ -1613,44 +1526,39 @@ static int inactive_file_is_low_global(struct zone *zone)
1613 * This uses a different ratio than the anonymous pages, because 1526 * This uses a different ratio than the anonymous pages, because
1614 * the page cache uses a use-once replacement algorithm. 1527 * the page cache uses a use-once replacement algorithm.
1615 */ 1528 */
1616static int inactive_file_is_low(struct mem_cgroup_zone *mz) 1529static int inactive_file_is_low(struct lruvec *lruvec)
1617{ 1530{
1618 if (!mem_cgroup_disabled()) 1531 if (!mem_cgroup_disabled())
1619 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, 1532 return mem_cgroup_inactive_file_is_low(lruvec);
1620 mz->zone);
1621 1533
1622 return inactive_file_is_low_global(mz->zone); 1534 return inactive_file_is_low_global(lruvec_zone(lruvec));
1623} 1535}
1624 1536
1625static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) 1537static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1626{ 1538{
1627 if (file) 1539 if (is_file_lru(lru))
1628 return inactive_file_is_low(mz); 1540 return inactive_file_is_low(lruvec);
1629 else 1541 else
1630 return inactive_anon_is_low(mz); 1542 return inactive_anon_is_low(lruvec);
1631} 1543}
1632 1544
1633static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1545static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1634 struct mem_cgroup_zone *mz, 1546 struct lruvec *lruvec, struct scan_control *sc)
1635 struct scan_control *sc, int priority)
1636{ 1547{
1637 int file = is_file_lru(lru);
1638
1639 if (is_active_lru(lru)) { 1548 if (is_active_lru(lru)) {
1640 if (inactive_list_is_low(mz, file)) 1549 if (inactive_list_is_low(lruvec, lru))
1641 shrink_active_list(nr_to_scan, mz, sc, priority, file); 1550 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1642 return 0; 1551 return 0;
1643 } 1552 }
1644 1553
1645 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); 1554 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1646} 1555}
1647 1556
1648static int vmscan_swappiness(struct mem_cgroup_zone *mz, 1557static int vmscan_swappiness(struct scan_control *sc)
1649 struct scan_control *sc)
1650{ 1558{
1651 if (global_reclaim(sc)) 1559 if (global_reclaim(sc))
1652 return vm_swappiness; 1560 return vm_swappiness;
1653 return mem_cgroup_swappiness(mz->mem_cgroup); 1561 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1654} 1562}
1655 1563
1656/* 1564/*
@@ -1662,17 +1570,18 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1662 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan 1570 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1663 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan 1571 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1664 */ 1572 */
1665static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, 1573static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1666 unsigned long *nr, int priority) 1574 unsigned long *nr)
1667{ 1575{
1668 unsigned long anon, file, free; 1576 unsigned long anon, file, free;
1669 unsigned long anon_prio, file_prio; 1577 unsigned long anon_prio, file_prio;
1670 unsigned long ap, fp; 1578 unsigned long ap, fp;
1671 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1579 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1672 u64 fraction[2], denominator; 1580 u64 fraction[2], denominator;
1673 enum lru_list lru; 1581 enum lru_list lru;
1674 int noswap = 0; 1582 int noswap = 0;
1675 bool force_scan = false; 1583 bool force_scan = false;
1584 struct zone *zone = lruvec_zone(lruvec);
1676 1585
1677 /* 1586 /*
1678 * If the zone or memcg is small, nr[l] can be 0. This 1587 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1684,7 +1593,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1684 * latencies, so it's better to scan a minimum amount there as 1593 * latencies, so it's better to scan a minimum amount there as
1685 * well. 1594 * well.
1686 */ 1595 */
1687 if (current_is_kswapd() && mz->zone->all_unreclaimable) 1596 if (current_is_kswapd() && zone->all_unreclaimable)
1688 force_scan = true; 1597 force_scan = true;
1689 if (!global_reclaim(sc)) 1598 if (!global_reclaim(sc))
1690 force_scan = true; 1599 force_scan = true;
@@ -1698,16 +1607,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1698 goto out; 1607 goto out;
1699 } 1608 }
1700 1609
1701 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + 1610 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1702 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1611 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1703 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + 1612 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1704 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1613 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1705 1614
1706 if (global_reclaim(sc)) { 1615 if (global_reclaim(sc)) {
1707 free = zone_page_state(mz->zone, NR_FREE_PAGES); 1616 free = zone_page_state(zone, NR_FREE_PAGES);
1708 /* If we have very few page cache pages, 1617 /* If we have very few page cache pages,
1709 force-scan anon pages. */ 1618 force-scan anon pages. */
1710 if (unlikely(file + free <= high_wmark_pages(mz->zone))) { 1619 if (unlikely(file + free <= high_wmark_pages(zone))) {
1711 fraction[0] = 1; 1620 fraction[0] = 1;
1712 fraction[1] = 0; 1621 fraction[1] = 0;
1713 denominator = 1; 1622 denominator = 1;
@@ -1719,8 +1628,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1719 * With swappiness at 100, anonymous and file have the same priority. 1628 * With swappiness at 100, anonymous and file have the same priority.
1720 * This scanning priority is essentially the inverse of IO cost. 1629 * This scanning priority is essentially the inverse of IO cost.
1721 */ 1630 */
1722 anon_prio = vmscan_swappiness(mz, sc); 1631 anon_prio = vmscan_swappiness(sc);
1723 file_prio = 200 - vmscan_swappiness(mz, sc); 1632 file_prio = 200 - anon_prio;
1724 1633
1725 /* 1634 /*
1726 * OK, so we have swap space and a fair amount of page cache 1635 * OK, so we have swap space and a fair amount of page cache
@@ -1733,7 +1642,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1733 * 1642 *
1734 * anon in [0], file in [1] 1643 * anon in [0], file in [1]
1735 */ 1644 */
1736 spin_lock_irq(&mz->zone->lru_lock); 1645 spin_lock_irq(&zone->lru_lock);
1737 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1646 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1738 reclaim_stat->recent_scanned[0] /= 2; 1647 reclaim_stat->recent_scanned[0] /= 2;
1739 reclaim_stat->recent_rotated[0] /= 2; 1648 reclaim_stat->recent_rotated[0] /= 2;
@@ -1754,7 +1663,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1754 1663
1755 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); 1664 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1756 fp /= reclaim_stat->recent_rotated[1] + 1; 1665 fp /= reclaim_stat->recent_rotated[1] + 1;
1757 spin_unlock_irq(&mz->zone->lru_lock); 1666 spin_unlock_irq(&zone->lru_lock);
1758 1667
1759 fraction[0] = ap; 1668 fraction[0] = ap;
1760 fraction[1] = fp; 1669 fraction[1] = fp;
@@ -1764,9 +1673,9 @@ out:
1764 int file = is_file_lru(lru); 1673 int file = is_file_lru(lru);
1765 unsigned long scan; 1674 unsigned long scan;
1766 1675
1767 scan = zone_nr_lru_pages(mz, lru); 1676 scan = get_lru_size(lruvec, lru);
1768 if (priority || noswap || !vmscan_swappiness(mz, sc)) { 1677 if (sc->priority || noswap || !vmscan_swappiness(sc)) {
1769 scan >>= priority; 1678 scan >>= sc->priority;
1770 if (!scan && force_scan) 1679 if (!scan && force_scan)
1771 scan = SWAP_CLUSTER_MAX; 1680 scan = SWAP_CLUSTER_MAX;
1772 scan = div64_u64(scan * fraction[file], denominator); 1681 scan = div64_u64(scan * fraction[file], denominator);
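The shift by sc->priority is what the new scan_control comment means by "Scan (total_size >> priority) pages at once". As a worked example, assuming DEF_PRIORITY is 12 as in this kernel:

        scan = get_lru_size(lruvec, lru);
        if (sc->priority || noswap || !vmscan_swappiness(sc)) {
                /*
                 * An LRU list of 1,048,576 pages is scanned in slices of
                 *   1048576 >> 12 = 256 pages at sc->priority == DEF_PRIORITY,
                 *   512 at priority 11, 1024 at priority 10, ... all of it at 0.
                 * force_scan below bumps a zero result up to SWAP_CLUSTER_MAX
                 * so small memcgs and zones still make progress.
                 */
                scan >>= sc->priority;
                if (!scan && force_scan)
                        scan = SWAP_CLUSTER_MAX;
                scan = div64_u64(scan * fraction[file], denominator);
        }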
@@ -1776,11 +1685,11 @@ out:
1776} 1685}
1777 1686
1778/* Use reclaim/compaction for costly allocs or under memory pressure */ 1687/* Use reclaim/compaction for costly allocs or under memory pressure */
1779static bool in_reclaim_compaction(int priority, struct scan_control *sc) 1688static bool in_reclaim_compaction(struct scan_control *sc)
1780{ 1689{
1781 if (COMPACTION_BUILD && sc->order && 1690 if (COMPACTION_BUILD && sc->order &&
1782 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 1691 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1783 priority < DEF_PRIORITY - 2)) 1692 sc->priority < DEF_PRIORITY - 2))
1784 return true; 1693 return true;
1785 1694
1786 return false; 1695 return false;
@@ -1793,17 +1702,16 @@ static bool in_reclaim_compaction(int priority, struct scan_control *sc)
1793 * calls try_to_compact_zone() that it will have enough free pages to succeed. 1702 * calls try_to_compact_zone() that it will have enough free pages to succeed.
1794 * It will give up earlier than that if there is difficulty reclaiming pages. 1703 * It will give up earlier than that if there is difficulty reclaiming pages.
1795 */ 1704 */
1796static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, 1705static inline bool should_continue_reclaim(struct lruvec *lruvec,
1797 unsigned long nr_reclaimed, 1706 unsigned long nr_reclaimed,
1798 unsigned long nr_scanned, 1707 unsigned long nr_scanned,
1799 int priority,
1800 struct scan_control *sc) 1708 struct scan_control *sc)
1801{ 1709{
1802 unsigned long pages_for_compaction; 1710 unsigned long pages_for_compaction;
1803 unsigned long inactive_lru_pages; 1711 unsigned long inactive_lru_pages;
1804 1712
1805 /* If not in reclaim/compaction mode, stop */ 1713 /* If not in reclaim/compaction mode, stop */
1806 if (!in_reclaim_compaction(priority, sc)) 1714 if (!in_reclaim_compaction(sc))
1807 return false; 1715 return false;
1808 1716
1809 /* Consider stopping depending on scan and reclaim activity */ 1717 /* Consider stopping depending on scan and reclaim activity */
@@ -1834,15 +1742,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
1834 * inactive lists are large enough, continue reclaiming 1742 * inactive lists are large enough, continue reclaiming
1835 */ 1743 */
1836 pages_for_compaction = (2UL << sc->order); 1744 pages_for_compaction = (2UL << sc->order);
1837 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1745 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1838 if (nr_swap_pages > 0) 1746 if (nr_swap_pages > 0)
1839 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1747 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
1840 if (sc->nr_reclaimed < pages_for_compaction && 1748 if (sc->nr_reclaimed < pages_for_compaction &&
1841 inactive_lru_pages > pages_for_compaction) 1749 inactive_lru_pages > pages_for_compaction)
1842 return true; 1750 return true;
1843 1751
1844 /* If compaction would go ahead or the allocation would succeed, stop */ 1752 /* If compaction would go ahead or the allocation would succeed, stop */
1845 switch (compaction_suitable(mz->zone, sc->order)) { 1753 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
1846 case COMPACT_PARTIAL: 1754 case COMPACT_PARTIAL:
1847 case COMPACT_CONTINUE: 1755 case COMPACT_CONTINUE:
1848 return false; 1756 return false;
@@ -1854,8 +1762,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
1854/* 1762/*
1855 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1763 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1856 */ 1764 */
1857static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, 1765static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1858 struct scan_control *sc)
1859{ 1766{
1860 unsigned long nr[NR_LRU_LISTS]; 1767 unsigned long nr[NR_LRU_LISTS];
1861 unsigned long nr_to_scan; 1768 unsigned long nr_to_scan;
@@ -1867,7 +1774,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
1867restart: 1774restart:
1868 nr_reclaimed = 0; 1775 nr_reclaimed = 0;
1869 nr_scanned = sc->nr_scanned; 1776 nr_scanned = sc->nr_scanned;
1870 get_scan_count(mz, sc, nr, priority); 1777 get_scan_count(lruvec, sc, nr);
1871 1778
1872 blk_start_plug(&plug); 1779 blk_start_plug(&plug);
1873 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1780 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1879,7 +1786,7 @@ restart:
1879 nr[lru] -= nr_to_scan; 1786 nr[lru] -= nr_to_scan;
1880 1787
1881 nr_reclaimed += shrink_list(lru, nr_to_scan, 1788 nr_reclaimed += shrink_list(lru, nr_to_scan,
1882 mz, sc, priority); 1789 lruvec, sc);
1883 } 1790 }
1884 } 1791 }
1885 /* 1792 /*
@@ -1890,7 +1797,8 @@ restart:
1890 * with multiple processes reclaiming pages, the total 1797 * with multiple processes reclaiming pages, the total
1891 * freeing target can get unreasonably large. 1798 * freeing target can get unreasonably large.
1892 */ 1799 */
1893 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1800 if (nr_reclaimed >= nr_to_reclaim &&
1801 sc->priority < DEF_PRIORITY)
1894 break; 1802 break;
1895 } 1803 }
1896 blk_finish_plug(&plug); 1804 blk_finish_plug(&plug);
@@ -1900,36 +1808,33 @@ restart:
1900 * Even if we did not try to evict anon pages at all, we want to 1808 * Even if we did not try to evict anon pages at all, we want to
1901 * rebalance the anon lru active/inactive ratio. 1809 * rebalance the anon lru active/inactive ratio.
1902 */ 1810 */
1903 if (inactive_anon_is_low(mz)) 1811 if (inactive_anon_is_low(lruvec))
1904 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); 1812 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1813 sc, LRU_ACTIVE_ANON);
1905 1814
1906 /* reclaim/compaction might need reclaim to continue */ 1815 /* reclaim/compaction might need reclaim to continue */
1907 if (should_continue_reclaim(mz, nr_reclaimed, 1816 if (should_continue_reclaim(lruvec, nr_reclaimed,
1908 sc->nr_scanned - nr_scanned, 1817 sc->nr_scanned - nr_scanned, sc))
1909 priority, sc))
1910 goto restart; 1818 goto restart;
1911 1819
1912 throttle_vm_writeout(sc->gfp_mask); 1820 throttle_vm_writeout(sc->gfp_mask);
1913} 1821}
1914 1822
1915static void shrink_zone(int priority, struct zone *zone, 1823static void shrink_zone(struct zone *zone, struct scan_control *sc)
1916 struct scan_control *sc)
1917{ 1824{
1918 struct mem_cgroup *root = sc->target_mem_cgroup; 1825 struct mem_cgroup *root = sc->target_mem_cgroup;
1919 struct mem_cgroup_reclaim_cookie reclaim = { 1826 struct mem_cgroup_reclaim_cookie reclaim = {
1920 .zone = zone, 1827 .zone = zone,
1921 .priority = priority, 1828 .priority = sc->priority,
1922 }; 1829 };
1923 struct mem_cgroup *memcg; 1830 struct mem_cgroup *memcg;
1924 1831
1925 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1832 memcg = mem_cgroup_iter(root, NULL, &reclaim);
1926 do { 1833 do {
1927 struct mem_cgroup_zone mz = { 1834 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
1928 .mem_cgroup = memcg, 1835
1929 .zone = zone, 1836 shrink_lruvec(lruvec, sc);
1930 };
1931 1837
1932 shrink_mem_cgroup_zone(priority, &mz, sc);
1933 /* 1838 /*
1934 * Limit reclaim has historically picked one memcg and 1839 * Limit reclaim has historically picked one memcg and
1935 * scanned it with decreasing priority levels until 1840 * scanned it with decreasing priority levels until
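shrink_zone() is left with nothing but walking the memcg hierarchy and running the per-zone freer on each memcg's lruvec; the temporary mem_cgroup_zone on the stack is gone. The core of the loop, reconstructed from the hunk above with its unchanged tail elided:

        memcg = mem_cgroup_iter(root, NULL, &reclaim);
        do {
                /* one lruvec per (memcg, zone) pair replaces struct mem_cgroup_zone */
                struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

                shrink_lruvec(lruvec, sc);

                /*
                 * ... the limit-reclaim early exit and the mem_cgroup_iter()
                 * advance to the next memcg lie outside this hunk and are
                 * unchanged ...
                 */
        } while (memcg);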
@@ -2005,8 +1910,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2005 * the caller that it should consider retrying the allocation instead of 1910 * the caller that it should consider retrying the allocation instead of
2006 * further reclaim. 1911 * further reclaim.
2007 */ 1912 */
2008static bool shrink_zones(int priority, struct zonelist *zonelist, 1913static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2009 struct scan_control *sc)
2010{ 1914{
2011 struct zoneref *z; 1915 struct zoneref *z;
2012 struct zone *zone; 1916 struct zone *zone;
@@ -2033,7 +1937,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2033 if (global_reclaim(sc)) { 1937 if (global_reclaim(sc)) {
2034 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1938 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2035 continue; 1939 continue;
2036 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1940 if (zone->all_unreclaimable &&
1941 sc->priority != DEF_PRIORITY)
2037 continue; /* Let kswapd poll it */ 1942 continue; /* Let kswapd poll it */
2038 if (COMPACTION_BUILD) { 1943 if (COMPACTION_BUILD) {
2039 /* 1944 /*
@@ -2065,7 +1970,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2065 /* need some check for avoid more shrink_zone() */ 1970 /* need some check for avoid more shrink_zone() */
2066 } 1971 }
2067 1972
2068 shrink_zone(priority, zone, sc); 1973 shrink_zone(zone, sc);
2069 } 1974 }
2070 1975
2071 return aborted_reclaim; 1976 return aborted_reclaim;
@@ -2116,7 +2021,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2116 struct scan_control *sc, 2021 struct scan_control *sc,
2117 struct shrink_control *shrink) 2022 struct shrink_control *shrink)
2118{ 2023{
2119 int priority;
2120 unsigned long total_scanned = 0; 2024 unsigned long total_scanned = 0;
2121 struct reclaim_state *reclaim_state = current->reclaim_state; 2025 struct reclaim_state *reclaim_state = current->reclaim_state;
2122 struct zoneref *z; 2026 struct zoneref *z;
@@ -2129,9 +2033,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2129 if (global_reclaim(sc)) 2033 if (global_reclaim(sc))
2130 count_vm_event(ALLOCSTALL); 2034 count_vm_event(ALLOCSTALL);
2131 2035
2132 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2036 do {
2133 sc->nr_scanned = 0; 2037 sc->nr_scanned = 0;
2134 aborted_reclaim = shrink_zones(priority, zonelist, sc); 2038 aborted_reclaim = shrink_zones(zonelist, sc);
2135 2039
2136 /* 2040 /*
2137 * Don't shrink slabs when reclaiming memory from 2041 * Don't shrink slabs when reclaiming memory from
@@ -2173,7 +2077,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2173 2077
2174 /* Take a nap, wait for some writeback to complete */ 2078 /* Take a nap, wait for some writeback to complete */
2175 if (!sc->hibernation_mode && sc->nr_scanned && 2079 if (!sc->hibernation_mode && sc->nr_scanned &&
2176 priority < DEF_PRIORITY - 2) { 2080 sc->priority < DEF_PRIORITY - 2) {
2177 struct zone *preferred_zone; 2081 struct zone *preferred_zone;
2178 2082
2179 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2083 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2181,7 +2085,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2181 &preferred_zone); 2085 &preferred_zone);
2182 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2086 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2183 } 2087 }
2184 } 2088 } while (--sc->priority >= 0);
2185 2089
2186out: 2090out:
2187 delayacct_freepages_end(); 2091 delayacct_freepages_end();
@@ -2219,6 +2123,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2219 .may_unmap = 1, 2123 .may_unmap = 1,
2220 .may_swap = 1, 2124 .may_swap = 1,
2221 .order = order, 2125 .order = order,
2126 .priority = DEF_PRIORITY,
2222 .target_mem_cgroup = NULL, 2127 .target_mem_cgroup = NULL,
2223 .nodemask = nodemask, 2128 .nodemask = nodemask,
2224 }; 2129 };
@@ -2251,17 +2156,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2251 .may_unmap = 1, 2156 .may_unmap = 1,
2252 .may_swap = !noswap, 2157 .may_swap = !noswap,
2253 .order = 0, 2158 .order = 0,
2159 .priority = 0,
2254 .target_mem_cgroup = memcg, 2160 .target_mem_cgroup = memcg,
2255 }; 2161 };
2256 struct mem_cgroup_zone mz = { 2162 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2257 .mem_cgroup = memcg,
2258 .zone = zone,
2259 };
2260 2163
2261 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2164 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2262 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2165 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2263 2166
2264 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, 2167 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2265 sc.may_writepage, 2168 sc.may_writepage,
2266 sc.gfp_mask); 2169 sc.gfp_mask);
2267 2170
@@ -2272,7 +2175,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2272 * will pick up pages from other mem cgroup's as well. We hack 2175 * will pick up pages from other mem cgroup's as well. We hack
2273 * the priority and make it zero. 2176 * the priority and make it zero.
2274 */ 2177 */
2275 shrink_mem_cgroup_zone(0, &mz, &sc); 2178 shrink_lruvec(lruvec, &sc);
2276 2179
2277 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2180 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2278 2181
@@ -2293,6 +2196,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2293 .may_swap = !noswap, 2196 .may_swap = !noswap,
2294 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2197 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2295 .order = 0, 2198 .order = 0,
2199 .priority = DEF_PRIORITY,
2296 .target_mem_cgroup = memcg, 2200 .target_mem_cgroup = memcg,
2297 .nodemask = NULL, /* we don't care the placement */ 2201 .nodemask = NULL, /* we don't care the placement */
2298 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2202 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2323,8 +2227,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2323} 2227}
2324#endif 2228#endif
2325 2229
2326static void age_active_anon(struct zone *zone, struct scan_control *sc, 2230static void age_active_anon(struct zone *zone, struct scan_control *sc)
2327 int priority)
2328{ 2231{
2329 struct mem_cgroup *memcg; 2232 struct mem_cgroup *memcg;
2330 2233
@@ -2333,14 +2236,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
2333 2236
2334 memcg = mem_cgroup_iter(NULL, NULL, NULL); 2237 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2335 do { 2238 do {
2336 struct mem_cgroup_zone mz = { 2239 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2337 .mem_cgroup = memcg,
2338 .zone = zone,
2339 };
2340 2240
2341 if (inactive_anon_is_low(&mz)) 2241 if (inactive_anon_is_low(lruvec))
2342 shrink_active_list(SWAP_CLUSTER_MAX, &mz, 2242 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2343 sc, priority, 0); 2243 sc, LRU_ACTIVE_ANON);
2344 2244
2345 memcg = mem_cgroup_iter(NULL, memcg, NULL); 2245 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2346 } while (memcg); 2246 } while (memcg);
@@ -2449,7 +2349,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2449{ 2349{
2450 int all_zones_ok; 2350 int all_zones_ok;
2451 unsigned long balanced; 2351 unsigned long balanced;
2452 int priority;
2453 int i; 2352 int i;
2454 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2353 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2455 unsigned long total_scanned; 2354 unsigned long total_scanned;
@@ -2473,11 +2372,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2473 }; 2372 };
2474loop_again: 2373loop_again:
2475 total_scanned = 0; 2374 total_scanned = 0;
2375 sc.priority = DEF_PRIORITY;
2476 sc.nr_reclaimed = 0; 2376 sc.nr_reclaimed = 0;
2477 sc.may_writepage = !laptop_mode; 2377 sc.may_writepage = !laptop_mode;
2478 count_vm_event(PAGEOUTRUN); 2378 count_vm_event(PAGEOUTRUN);
2479 2379
2480 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2380 do {
2481 unsigned long lru_pages = 0; 2381 unsigned long lru_pages = 0;
2482 int has_under_min_watermark_zone = 0; 2382 int has_under_min_watermark_zone = 0;
2483 2383
@@ -2494,14 +2394,15 @@ loop_again:
2494 if (!populated_zone(zone)) 2394 if (!populated_zone(zone))
2495 continue; 2395 continue;
2496 2396
2497 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2397 if (zone->all_unreclaimable &&
2398 sc.priority != DEF_PRIORITY)
2498 continue; 2399 continue;
2499 2400
2500 /* 2401 /*
2501 * Do some background aging of the anon list, to give 2402 * Do some background aging of the anon list, to give
2502 * pages a chance to be referenced before reclaiming. 2403 * pages a chance to be referenced before reclaiming.
2503 */ 2404 */
2504 age_active_anon(zone, &sc, priority); 2405 age_active_anon(zone, &sc);
2505 2406
2506 /* 2407 /*
2507 * If the number of buffer_heads in the machine 2408 * If the number of buffer_heads in the machine
@@ -2549,7 +2450,8 @@ loop_again:
2549 if (!populated_zone(zone)) 2450 if (!populated_zone(zone))
2550 continue; 2451 continue;
2551 2452
2552 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2453 if (zone->all_unreclaimable &&
2454 sc.priority != DEF_PRIORITY)
2553 continue; 2455 continue;
2554 2456
2555 sc.nr_scanned = 0; 2457 sc.nr_scanned = 0;
@@ -2593,7 +2495,7 @@ loop_again:
2593 !zone_watermark_ok_safe(zone, testorder, 2495 !zone_watermark_ok_safe(zone, testorder,
2594 high_wmark_pages(zone) + balance_gap, 2496 high_wmark_pages(zone) + balance_gap,
2595 end_zone, 0)) { 2497 end_zone, 0)) {
2596 shrink_zone(priority, zone, &sc); 2498 shrink_zone(zone, &sc);
2597 2499
2598 reclaim_state->reclaimed_slab = 0; 2500 reclaim_state->reclaimed_slab = 0;
2599 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2501 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2650,7 +2552,7 @@ loop_again:
2650 * OK, kswapd is getting into trouble. Take a nap, then take 2552 * OK, kswapd is getting into trouble. Take a nap, then take
2651 * another pass across the zones. 2553 * another pass across the zones.
2652 */ 2554 */
2653 if (total_scanned && (priority < DEF_PRIORITY - 2)) { 2555 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2654 if (has_under_min_watermark_zone) 2556 if (has_under_min_watermark_zone)
2655 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2557 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2656 else 2558 else
@@ -2665,7 +2567,7 @@ loop_again:
2665 */ 2567 */
2666 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2568 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2667 break; 2569 break;
2668 } 2570 } while (--sc.priority >= 0);
2669out: 2571out:
2670 2572
2671 /* 2573 /*
@@ -2715,7 +2617,8 @@ out:
2715 if (!populated_zone(zone)) 2617 if (!populated_zone(zone))
2716 continue; 2618 continue;
2717 2619
2718 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2620 if (zone->all_unreclaimable &&
2621 sc.priority != DEF_PRIORITY)
2719 continue; 2622 continue;
2720 2623
2721 /* Would compaction fail due to lack of free memory? */ 2624 /* Would compaction fail due to lack of free memory? */
@@ -2786,7 +2689,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2786 * them before going back to sleep. 2689 * them before going back to sleep.
2787 */ 2690 */
2788 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2691 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2789 schedule(); 2692
2693 if (!kthread_should_stop())
2694 schedule();
2695
2790 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2696 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2791 } else { 2697 } else {
2792 if (remaining) 2698 if (remaining)
@@ -2982,6 +2888,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2982 .nr_to_reclaim = nr_to_reclaim, 2888 .nr_to_reclaim = nr_to_reclaim,
2983 .hibernation_mode = 1, 2889 .hibernation_mode = 1,
2984 .order = 0, 2890 .order = 0,
2891 .priority = DEF_PRIORITY,
2985 }; 2892 };
2986 struct shrink_control shrink = { 2893 struct shrink_control shrink = {
2987 .gfp_mask = sc.gfp_mask, 2894 .gfp_mask = sc.gfp_mask,
@@ -3052,14 +2959,17 @@ int kswapd_run(int nid)
3052} 2959}
3053 2960
3054/* 2961/*
3055 * Called by memory hotplug when all memory in a node is offlined. 2962 * Called by memory hotplug when all memory in a node is offlined. Caller must
2963 * hold lock_memory_hotplug().
3056 */ 2964 */
3057void kswapd_stop(int nid) 2965void kswapd_stop(int nid)
3058{ 2966{
3059 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 2967 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3060 2968
3061 if (kswapd) 2969 if (kswapd) {
3062 kthread_stop(kswapd); 2970 kthread_stop(kswapd);
2971 NODE_DATA(nid)->kswapd = NULL;
2972 }
3063} 2973}
3064 2974
3065static int __init kswapd_init(void) 2975static int __init kswapd_init(void)
@@ -3159,7 +3069,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3159 const unsigned long nr_pages = 1 << order; 3069 const unsigned long nr_pages = 1 << order;
3160 struct task_struct *p = current; 3070 struct task_struct *p = current;
3161 struct reclaim_state reclaim_state; 3071 struct reclaim_state reclaim_state;
3162 int priority;
3163 struct scan_control sc = { 3072 struct scan_control sc = {
3164 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3073 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3165 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3074 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3168,6 +3077,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3168 SWAP_CLUSTER_MAX), 3077 SWAP_CLUSTER_MAX),
3169 .gfp_mask = gfp_mask, 3078 .gfp_mask = gfp_mask,
3170 .order = order, 3079 .order = order,
3080 .priority = ZONE_RECLAIM_PRIORITY,
3171 }; 3081 };
3172 struct shrink_control shrink = { 3082 struct shrink_control shrink = {
3173 .gfp_mask = sc.gfp_mask, 3083 .gfp_mask = sc.gfp_mask,
@@ -3190,11 +3100,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3190 * Free memory by calling shrink zone with increasing 3100 * Free memory by calling shrink zone with increasing
3191 * priorities until we have enough memory freed. 3101 * priorities until we have enough memory freed.
3192 */ 3102 */
3193 priority = ZONE_RECLAIM_PRIORITY;
3194 do { 3103 do {
3195 shrink_zone(priority, zone, &sc); 3104 shrink_zone(zone, &sc);
3196 priority--; 3105 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3197 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3198 } 3106 }
3199 3107
3200 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3108 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3345,6 +3253,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3345 zone = pagezone; 3253 zone = pagezone;
3346 spin_lock_irq(&zone->lru_lock); 3254 spin_lock_irq(&zone->lru_lock);
3347 } 3255 }
3256 lruvec = mem_cgroup_page_lruvec(page, zone);
3348 3257
3349 if (!PageLRU(page) || !PageUnevictable(page)) 3258 if (!PageLRU(page) || !PageUnevictable(page))
3350 continue; 3259 continue;
@@ -3354,11 +3263,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3354 3263
3355 VM_BUG_ON(PageActive(page)); 3264 VM_BUG_ON(PageActive(page));
3356 ClearPageUnevictable(page); 3265 ClearPageUnevictable(page);
3357 __dec_zone_state(zone, NR_UNEVICTABLE); 3266 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3358 lruvec = mem_cgroup_lru_move_lists(zone, page, 3267 add_page_to_lru_list(page, lruvec, lru);
3359 LRU_UNEVICTABLE, lru);
3360 list_move(&page->lru, &lruvec->lists[lru]);
3361 __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
3362 pgrescued++; 3268 pgrescued++;
3363 } 3269 }
3364 } 3270 }
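The check_move_unevictable_pages() cleanup leans on the same companion change as the rest of the patch: add_page_to_lru_list() and del_page_from_lru_list() now take the lruvec and the lru index and are presumed to do the list_move plus the size bookkeeping themselves, which is why the open-coded __dec_zone_state()/__inc_zone_state() pair and the explicit list_move() drop out here. The rescue path as it reads after this hunk, using only lines present in the diff:

        lruvec = mem_cgroup_page_lruvec(page, zone);
        /* ... */
        VM_BUG_ON(PageActive(page));
        ClearPageUnevictable(page);
        /* the helpers are presumed to adjust NR_UNEVICTABLE and
         * NR_INACTIVE_ANON + lru internally */
        del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
        add_page_to_lru_list(page, lruvec, lru);
        pgrescued++;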