author		Ingo Molnar <mingo@elte.hu>	2010-08-31 03:45:21 -0400
committer	Ingo Molnar <mingo@elte.hu>	2010-08-31 03:45:46 -0400
commit		daab7fc734a53fdeaf844b7c03053118ad1769da (patch)
tree		575deb3cdcc6dda562acaed6f7c29bc81ae01cf2 /mm/vmscan.c
parent		774ea0bcb27f57b6fd521b3b6c43237782fed4b9 (diff)
parent		2bfc96a127bc1cc94d26bfaa40159966064f9c8c (diff)
Merge commit 'v2.6.36-rc3' into x86/memblock
Conflicts:
	arch/x86/kernel/trampoline.c
	mm/memblock.c

Merge reason: Resolve the conflicts, update to latest upstream.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	548
1 file changed, 305 insertions(+), 243 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..c391c320dbaf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmscan.h>
+
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			/* synchronous write or broken a_ops? */
 			ClearPageReclaim(page);
 		}
+		trace_mm_vmscan_writepage(page,
+			trace_reclaim_flags(page, sync_writeback));
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
 		return PAGE_SUCCESS;
 	}
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
 	return PAGEREF_RECLAIM;
 }
 
+static noinline_for_stack void free_page_list(struct list_head *free_pages)
+{
+	struct pagevec freed_pvec;
+	struct page *page, *tmp;
+
+	pagevec_init(&freed_pvec, 1);
+
+	list_for_each_entry_safe(page, tmp, free_pages, lru) {
+		list_del(&page->lru);
+		if (!pagevec_add(&freed_pvec, page)) {
+			__pagevec_free(&freed_pvec);
+			pagevec_reinit(&freed_pvec);
+		}
+	}
+
+	pagevec_free(&freed_pvec);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 					enum pageout_io sync_writeback)
 {
 	LIST_HEAD(ret_pages);
-	struct pagevec freed_pvec;
+	LIST_HEAD(free_pages);
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
 
-	pagevec_init(&freed_pvec, 1);
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		__clear_page_locked(page);
 free_it:
 		nr_reclaimed++;
-		if (!pagevec_add(&freed_pvec, page)) {
-			__pagevec_free(&freed_pvec);
-			pagevec_reinit(&freed_pvec);
-		}
+
+		/*
+		 * Is there need to periodically free_page_list? It would
+		 * appear not as the counts should be low
+		 */
+		list_add(&page->lru, &free_pages);
 		continue;
 
 cull_mlocked:
@@ -832,9 +856,10 @@ keep:
 		list_add(&page->lru, &ret_pages);
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
+
+	free_page_list(&free_pages);
+
 	list_splice(&ret_pages, page_list);
-	if (pagevec_count(&freed_pvec))
-		__pagevec_free(&freed_pvec);
 	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		unsigned long *scanned, int order, int mode, int file)
 {
 	unsigned long nr_taken = 0;
+	unsigned long nr_lumpy_taken = 0;
+	unsigned long nr_lumpy_dirty = 0;
+	unsigned long nr_lumpy_failed = 0;
 	unsigned long scan;
 
 	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 				list_move(&cursor_page->lru, dst);
 				mem_cgroup_del_lru(cursor_page);
 				nr_taken++;
+				nr_lumpy_taken++;
+				if (PageDirty(cursor_page))
+					nr_lumpy_dirty++;
 				scan++;
+			} else {
+				if (mode == ISOLATE_BOTH &&
+						page_count(cursor_page))
+					nr_lumpy_failed++;
 			}
 		}
 	}
 
 	*scanned = scan;
+
+	trace_mm_vmscan_lru_isolate(order,
+			nr_to_scan, scan,
+			nr_taken,
+			nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
+			mode);
 	return nr_taken;
 }
 
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
 			ClearPageActive(page);
 			nr_active++;
 		}
-		count[lru]++;
+		if (count)
+			count[lru]++;
 	}
 
 	return nr_active;
@@ -1112,174 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
 }
 
 /*
- * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
- * of reclaimed pages
+ * TODO: Try merging with migrations version of putback_lru_pages
  */
-static unsigned long shrink_inactive_list(unsigned long max_scan,
-			struct zone *zone, struct scan_control *sc,
-			int priority, int file)
+static noinline_for_stack void
+putback_lru_pages(struct zone *zone, struct scan_control *sc,
+				unsigned long nr_anon, unsigned long nr_file,
+				struct list_head *page_list)
 {
-	LIST_HEAD(page_list);
+	struct page *page;
 	struct pagevec pvec;
-	unsigned long nr_scanned = 0;
-	unsigned long nr_reclaimed = 0;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	while (unlikely(too_many_isolated(zone, file, sc))) {
-		congestion_wait(BLK_RW_ASYNC, HZ/10);
+	pagevec_init(&pvec, 1);
 
-		/* We are about to die and free our memory. Return now. */
-		if (fatal_signal_pending(current))
-			return SWAP_CLUSTER_MAX;
+	/*
+	 * Put back any unfreeable pages.
+	 */
+	spin_lock(&zone->lru_lock);
+	while (!list_empty(page_list)) {
+		int lru;
+		page = lru_to_page(page_list);
+		VM_BUG_ON(PageLRU(page));
+		list_del(&page->lru);
+		if (unlikely(!page_evictable(page, NULL))) {
+			spin_unlock_irq(&zone->lru_lock);
+			putback_lru_page(page);
+			spin_lock_irq(&zone->lru_lock);
+			continue;
+		}
+		SetPageLRU(page);
+		lru = page_lru(page);
+		add_page_to_lru_list(zone, page, lru);
+		if (is_active_lru(lru)) {
+			int file = is_file_lru(lru);
+			reclaim_stat->recent_rotated[file]++;
+		}
+		if (!pagevec_add(&pvec, page)) {
+			spin_unlock_irq(&zone->lru_lock);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
 	}
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+
+	spin_unlock_irq(&zone->lru_lock);
+	pagevec_release(&pvec);
+}
 
+static noinline_for_stack void update_isolated_counts(struct zone *zone,
+					struct scan_control *sc,
+					unsigned long *nr_anon,
+					unsigned long *nr_file,
+					struct list_head *isolated_list)
+{
+	unsigned long nr_active;
+	unsigned int count[NR_LRU_LISTS] = { 0, };
+	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	pagevec_init(&pvec, 1);
+	nr_active = clear_active_flags(isolated_list, count);
+	__count_vm_events(PGDEACTIVATE, nr_active);
 
-	lru_add_drain();
-	spin_lock_irq(&zone->lru_lock);
-	do {
-		struct page *page;
-		unsigned long nr_taken;
-		unsigned long nr_scan;
-		unsigned long nr_freed;
-		unsigned long nr_active;
-		unsigned int count[NR_LRU_LISTS] = { 0, };
-		int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
-		unsigned long nr_anon;
-		unsigned long nr_file;
+	__mod_zone_page_state(zone, NR_ACTIVE_FILE,
+			      -count[LRU_ACTIVE_FILE]);
+	__mod_zone_page_state(zone, NR_INACTIVE_FILE,
+			      -count[LRU_INACTIVE_FILE]);
+	__mod_zone_page_state(zone, NR_ACTIVE_ANON,
+			      -count[LRU_ACTIVE_ANON]);
+	__mod_zone_page_state(zone, NR_INACTIVE_ANON,
+			      -count[LRU_INACTIVE_ANON]);
 
-		if (scanning_global_lru(sc)) {
-			nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX,
-							&page_list, &nr_scan,
-							sc->order, mode,
-							zone, 0, file);
-			zone->pages_scanned += nr_scan;
-			if (current_is_kswapd())
-				__count_zone_vm_events(PGSCAN_KSWAPD, zone,
-						       nr_scan);
-			else
-				__count_zone_vm_events(PGSCAN_DIRECT, zone,
-						       nr_scan);
-		} else {
-			nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
-							&page_list, &nr_scan,
-							sc->order, mode,
-							zone, sc->mem_cgroup,
-							0, file);
-			/*
-			 * mem_cgroup_isolate_pages() keeps track of
-			 * scanned pages on its own.
-			 */
-		}
+	*nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
+	*nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
 
-		if (nr_taken == 0)
-			goto done;
+	reclaim_stat->recent_scanned[0] += *nr_anon;
+	reclaim_stat->recent_scanned[1] += *nr_file;
+}
 
-		nr_active = clear_active_flags(&page_list, count);
-		__count_vm_events(PGDEACTIVATE, nr_active);
+/*
+ * Returns true if the caller should wait to clean dirty/writeback pages.
+ *
+ * If we are direct reclaiming for contiguous pages and we do not reclaim
+ * everything in the list, try again and wait for writeback IO to complete.
+ * This will stall high-order allocations noticeably. Only do that when really
+ * need to free the pages under high memory pressure.
+ */
+static inline bool should_reclaim_stall(unsigned long nr_taken,
+					unsigned long nr_freed,
+					int priority,
+					struct scan_control *sc)
+{
+	int lumpy_stall_priority;
 
-		__mod_zone_page_state(zone, NR_ACTIVE_FILE,
-						-count[LRU_ACTIVE_FILE]);
-		__mod_zone_page_state(zone, NR_INACTIVE_FILE,
-						-count[LRU_INACTIVE_FILE]);
-		__mod_zone_page_state(zone, NR_ACTIVE_ANON,
-						-count[LRU_ACTIVE_ANON]);
-		__mod_zone_page_state(zone, NR_INACTIVE_ANON,
-						-count[LRU_INACTIVE_ANON]);
+	/* kswapd should not stall on sync IO */
+	if (current_is_kswapd())
+		return false;
 
-		nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-		nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-		__mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
-		__mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
+	/* Only stall on lumpy reclaim */
+	if (!sc->lumpy_reclaim_mode)
+		return false;
 
-		reclaim_stat->recent_scanned[0] += nr_anon;
-		reclaim_stat->recent_scanned[1] += nr_file;
+	/* If we have relaimed everything on the isolated list, no stall */
+	if (nr_freed == nr_taken)
+		return false;
 
-		spin_unlock_irq(&zone->lru_lock);
+	/*
+	 * For high-order allocations, there are two stall thresholds.
+	 * High-cost allocations stall immediately where as lower
+	 * order allocations such as stacks require the scanning
+	 * priority to be much higher before stalling.
+	 */
+	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+		lumpy_stall_priority = DEF_PRIORITY;
+	else
+		lumpy_stall_priority = DEF_PRIORITY / 3;
 
-		nr_scanned += nr_scan;
-		nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+	return priority <= lumpy_stall_priority;
+}
 
+/*
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages
+ */
+static noinline_for_stack unsigned long
+shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
+			struct scan_control *sc, int priority, int file)
+{
+	LIST_HEAD(page_list);
+	unsigned long nr_scanned;
+	unsigned long nr_reclaimed = 0;
+	unsigned long nr_taken;
+	unsigned long nr_active;
+	unsigned long nr_anon;
+	unsigned long nr_file;
+
+	while (unlikely(too_many_isolated(zone, file, sc))) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
+		/* We are about to die and free our memory. Return now. */
+		if (fatal_signal_pending(current))
+			return SWAP_CLUSTER_MAX;
+	}
+
+
+	lru_add_drain();
+	spin_lock_irq(&zone->lru_lock);
+
+	if (scanning_global_lru(sc)) {
+		nr_taken = isolate_pages_global(nr_to_scan,
+			&page_list, &nr_scanned, sc->order,
+			sc->lumpy_reclaim_mode ?
+				ISOLATE_BOTH : ISOLATE_INACTIVE,
+			zone, 0, file);
+		zone->pages_scanned += nr_scanned;
+		if (current_is_kswapd())
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
+					       nr_scanned);
+		else
+			__count_zone_vm_events(PGSCAN_DIRECT, zone,
+					       nr_scanned);
+	} else {
+		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
+			&page_list, &nr_scanned, sc->order,
+			sc->lumpy_reclaim_mode ?
+				ISOLATE_BOTH : ISOLATE_INACTIVE,
+			zone, sc->mem_cgroup,
+			0, file);
 		/*
-		 * If we are direct reclaiming for contiguous pages and we do
-		 * not reclaim everything in the list, try again and wait
-		 * for IO to complete. This will stall high-order allocations
-		 * but that should be acceptable to the caller
+		 * mem_cgroup_isolate_pages() keeps track of
+		 * scanned pages on its own.
 		 */
-		if (nr_freed < nr_taken && !current_is_kswapd() &&
-		    sc->lumpy_reclaim_mode) {
-			congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
 
-			/*
-			 * The attempt at page out may have made some
-			 * of the pages active, mark them inactive again.
-			 */
-			nr_active = clear_active_flags(&page_list, count);
-			count_vm_events(PGDEACTIVATE, nr_active);
+	if (nr_taken == 0) {
+		spin_unlock_irq(&zone->lru_lock);
+		return 0;
+	}
 
-			nr_freed += shrink_page_list(&page_list, sc,
-						PAGEOUT_IO_SYNC);
-		}
+	update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
 
-		nr_reclaimed += nr_freed;
+	spin_unlock_irq(&zone->lru_lock);
 
-		local_irq_disable();
-		if (current_is_kswapd())
-			__count_vm_events(KSWAPD_STEAL, nr_freed);
-		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
+	nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+	/* Check if we should syncronously wait for writeback */
+	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
-		spin_lock(&zone->lru_lock);
 		/*
-		 * Put back any unfreeable pages.
+		 * The attempt at page out may have made some
+		 * of the pages active, mark them inactive again.
 		 */
-		while (!list_empty(&page_list)) {
-			int lru;
-			page = lru_to_page(&page_list);
-			VM_BUG_ON(PageLRU(page));
-			list_del(&page->lru);
-			if (unlikely(!page_evictable(page, NULL))) {
-				spin_unlock_irq(&zone->lru_lock);
-				putback_lru_page(page);
-				spin_lock_irq(&zone->lru_lock);
-				continue;
-			}
-			SetPageLRU(page);
-			lru = page_lru(page);
-			add_page_to_lru_list(zone, page, lru);
-			if (is_active_lru(lru)) {
-				int file = is_file_lru(lru);
-				reclaim_stat->recent_rotated[file]++;
-			}
-			if (!pagevec_add(&pvec, page)) {
-				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_release(&pvec);
-				spin_lock_irq(&zone->lru_lock);
-			}
-		}
-		__mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
-		__mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
+		nr_active = clear_active_flags(&page_list, NULL);
+		count_vm_events(PGDEACTIVATE, nr_active);
 
-	} while (nr_scanned < max_scan);
+		nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
+	}
 
-done:
-	spin_unlock_irq(&zone->lru_lock);
-	pagevec_release(&pvec);
-	return nr_reclaimed;
-}
+	local_irq_disable();
+	if (current_is_kswapd())
+		__count_vm_events(KSWAPD_STEAL, nr_reclaimed);
+	__count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
 
-/*
- * We are about to scan this zone at a certain priority level. If that priority
- * level is smaller (ie: more urgent) than the previous priority, then note
- * that priority level within the zone.  This is done so that when the next
- * process comes in to scan this zone, it will immediately start out at this
- * priority level rather than having to build up its own scanning priority.
- * Here, this priority affects only the reclaim-mapped threshold.
- */
-static inline void note_zone_scanning_priority(struct zone *zone, int priority)
-{
-	if (priority < zone->prev_priority)
-		zone->prev_priority = priority;
+	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+	return nr_reclaimed;
 }
 
 /*
@@ -1583,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	}
 
 	/*
+	 * With swappiness at 100, anonymous and file have the same priority.
+	 * This scanning priority is essentially the inverse of IO cost.
+	 */
+	anon_prio = sc->swappiness;
+	file_prio = 200 - sc->swappiness;
+
+	/*
 	 * OK, so we have swap space and a fair amount of page cache
 	 * pages.  We use the recently rotated / recently scanned
 	 * ratios to determine how valuable each cache is.
@@ -1593,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	 *
 	 * anon in [0], file in [1]
 	 */
+	spin_lock_irq(&zone->lru_lock);
 	if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
-		spin_lock_irq(&zone->lru_lock);
 		reclaim_stat->recent_scanned[0] /= 2;
 		reclaim_stat->recent_rotated[0] /= 2;
-		spin_unlock_irq(&zone->lru_lock);
 	}
 
 	if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
-		spin_lock_irq(&zone->lru_lock);
 		reclaim_stat->recent_scanned[1] /= 2;
 		reclaim_stat->recent_rotated[1] /= 2;
-		spin_unlock_irq(&zone->lru_lock);
 	}
 
 	/*
-	 * With swappiness at 100, anonymous and file have the same priority.
-	 * This scanning priority is essentially the inverse of IO cost.
-	 */
-	anon_prio = sc->swappiness;
-	file_prio = 200 - sc->swappiness;
-
-	/*
 	 * The amount of pressure on anon vs file pages is inversely
 	 * proportional to the fraction of recently scanned pages on
 	 * each list that were recently referenced and in active use.
@@ -1624,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 
 	fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
 	fp /= reclaim_stat->recent_rotated[1] + 1;
+	spin_unlock_irq(&zone->lru_lock);
 
 	fraction[0] = ap;
 	fraction[1] = fp;
@@ -1729,13 +1807,12 @@ static void shrink_zone(int priority, struct zone *zone,
 static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
-	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 	struct zoneref *z;
 	struct zone *zone;
 	bool all_unreclaimable = true;
 
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
-					sc->nodemask) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask), sc->nodemask) {
 		if (!populated_zone(zone))
 			continue;
 		/*
@@ -1745,17 +1822,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
 		if (scanning_global_lru(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			note_zone_scanning_priority(zone, priority);
-
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;	/* Let kswapd poll it */
-		} else {
-			/*
-			 * Ignore cpuset limitation here. We just want to reduce
-			 * # of used pages by us regardless of memory shortage.
-			 */
-			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
-							priority);
 		}
 
 		shrink_zone(priority, zone, sc);
@@ -1787,10 +1855,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	bool all_unreclaimable;
 	unsigned long total_scanned = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	unsigned long lru_pages = 0;
 	struct zoneref *z;
 	struct zone *zone;
-	enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 	unsigned long writeback_threshold;
 
 	get_mems_allowed();
@@ -1798,18 +1864,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 
 	if (scanning_global_lru(sc))
 		count_vm_event(ALLOCSTALL);
-	/*
-	 * mem_cgroup will not do shrink_slab.
-	 */
-	if (scanning_global_lru(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
-
-			lru_pages += zone_reclaimable_pages(zone);
-		}
-	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
@@ -1821,6 +1875,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 * over limit cgroups
 		 */
 		if (scanning_global_lru(sc)) {
+			unsigned long lru_pages = 0;
+			for_each_zone_zonelist(zone, z, zonelist,
+					gfp_zone(sc->gfp_mask)) {
+				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+					continue;
+
+				lru_pages += zone_reclaimable_pages(zone);
+			}
+
 			shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
 			if (reclaim_state) {
 				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1924,6 @@ out:
 	if (priority < 0)
 		priority = 0;
 
-	if (scanning_global_lru(sc)) {
-		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-				continue;
-
-			zone->prev_priority = priority;
-		}
-	} else
-		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
-
 	delayacct_freepages_end();
 	put_mems_allowed();
 
@@ -1888,6 +1940,7 @@ out:
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
+	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.gfp_mask = gfp_mask,
 		.may_writepage = !laptop_mode,
@@ -1900,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.nodemask = nodemask,
 	};
 
-	return do_try_to_free_pages(zonelist, &sc);
+	trace_mm_vmscan_direct_reclaim_begin(order,
+				sc.may_writepage,
+				gfp_mask);
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+
+	return nr_reclaimed;
 }
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1908,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
 						unsigned int swappiness,
-						struct zone *zone, int nid)
+						struct zone *zone)
 {
 	struct scan_control sc = {
+		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
@@ -1918,13 +1980,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.order = 0,
 		.mem_cgroup = mem,
 	};
-	nodemask_t nm = nodemask_of_node(nid);
-
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
-	sc.nodemask = &nm;
-	sc.nr_reclaimed = 0;
-	sc.nr_scanned = 0;
+
+	trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
+						      sc.may_writepage,
+						      sc.gfp_mask);
+
 	/*
 	 * NOTE: Although we can get the priority field, using it
 	 * here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 	 * the priority and make it zero.
 	 */
 	shrink_zone(0, zone, &sc);
+
+	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
+
 	return sc.nr_reclaimed;
 }
 
@@ -1942,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   unsigned int swappiness)
 {
 	struct zonelist *zonelist;
+	unsigned long nr_reclaimed;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -1956,7 +2022,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
 	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
-	return do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_memcg_reclaim_begin(0,
+					    sc.may_writepage,
+					    sc.gfp_mask);
+
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
+	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+
+	return nr_reclaimed;
 }
 #endif
 
@@ -2028,22 +2103,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 		.order = order,
 		.mem_cgroup = NULL,
 	};
-	/*
-	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to
-	 * free_pages == high_wmark_pages(zone).
-	 */
-	int temp_priority[MAX_NR_ZONES];
-
 loop_again:
 	total_scanned = 0;
 	sc.nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
-	for (i = 0; i < pgdat->nr_zones; i++)
-		temp_priority[i] = DEF_PRIORITY;
-
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;
@@ -2103,7 +2168,6 @@ loop_again:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 			int nr_slab;
-			int nid, zid;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2111,18 +2175,14 @@ loop_again:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
-			note_zone_scanning_priority(zone, priority);
 
-			nid = pgdat->node_id;
-			zid = zone_idx(zone);
 			/*
 			 * Call soft limit reclaim before calling shrink_zone.
 			 * For now we ignore the return value
 			 */
-			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
-							nid, zid);
+			mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
 			/*
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
@@ -2186,16 +2246,6 @@ loop_again:
 			break;
 	}
 out:
-	/*
-	 * Note within each zone the priority level at which this zone was
-	 * brought into a happy state. So that the next thread which scans this
-	 * zone will start out at that priority level.
-	 */
-	for (i = 0; i < pgdat->nr_zones; i++) {
-		struct zone *zone = pgdat->node_zones + i;
-
-		zone->prev_priority = temp_priority[i];
-	}
 	if (!all_zones_ok) {
 		cond_resched();
 
@@ -2299,9 +2349,10 @@ static int kswapd(void *p)
 			 * premature sleep. If not, then go fully
 			 * to sleep until explicitly woken up
 			 */
-			if (!sleeping_prematurely(pgdat, order, remaining))
+			if (!sleeping_prematurely(pgdat, order, remaining)) {
+				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 				schedule();
-			else {
+			} else {
 				if (remaining)
 					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
 				else
@@ -2321,8 +2372,10 @@ static int kswapd(void *p)
 		 * We can speed up thawing tasks if we don't call balance_pgdat
 		 * after returning from the refrigerator
 		 */
-		if (!ret)
+		if (!ret) {
+			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
 			balance_pgdat(pgdat, order);
+		}
 	}
 	return 0;
 }
@@ -2342,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
+	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
 	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2644,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.swappiness = vm_swappiness,
 		.order = order,
 	};
-	unsigned long slab_reclaimable;
+	unsigned long nr_slab_pages0, nr_slab_pages1;
 
-	disable_swap_token();
 	cond_resched();
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 */
 		priority = ZONE_RECLAIM_PRIORITY;
 		do {
-			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
 
-	slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-	if (slab_reclaimable > zone->min_slab_pages) {
+	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+	if (nr_slab_pages0 > zone->min_slab_pages) {
 		/*
 		 * shrink_slab() does not currently allow us to determine how
 		 * many pages were freed in this zone. So we take the current
@@ -2629,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * Note that shrink_slab will free memory on all zones and may
 		 * take a long time.
 		 */
-		while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
-			zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
-				slab_reclaimable - nr_pages)
-			;
+		for (;;) {
+			unsigned long lru_pages = zone_reclaimable_pages(zone);
+
+			/* No reclaimable slab or very low memory pressure */
+			if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
+				break;
+
+			/* Freed enough memory */
+			nr_slab_pages1 = zone_page_state(zone,
+							NR_SLAB_RECLAIMABLE);
+			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
+				break;
+		}
 
 		/*
 		 * Update nr_reclaimed by the number of slab pages we
 		 * reclaimed from this zone.
 		 */
-		sc.nr_reclaimed += slab_reclaimable -
-			zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+		if (nr_slab_pages1 < nr_slab_pages0)
+			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
 	}
 
 	p->reclaim_state = NULL;