author    Rik van Riel <riel@redhat.com>  2008-10-18 23:26:34 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2008-10-20 11:50:25 -0400
commit    556adecba110bf5f1db6c6b56416cfab5bcab698
tree      a721d84d28c4d99a54632b472b452ea3d4b2b137
parent    4f98a2fee8acdb4ac84545df98cccecfd130f8db
vmscan: second chance replacement for anonymous pages
We avoid evicting and scanning anonymous pages for the most part, but under
some workloads we can end up with most of memory filled with anonymous pages.
At that point, we suddenly need to clear the referenced bits on all of memory,
which can take ages on very large memory systems.

We can reduce the maximum number of pages that need to be scanned by not
taking the referenced state into account when deactivating an anonymous page.
After all, every anonymous page starts out referenced, so why check?

If an anonymous page gets referenced again before it reaches the end of the
inactive list, we move it back to the active list.

To keep the maximum amount of necessary work reasonable, we scale the active
to inactive ratio with the size of memory, using the formula

	active:inactive ratio = sqrt(memory in GB * 10)

Kswapd CPU use now seems to scale by the amount of pageout bandwidth, instead
of by the amount of memory present in the system.

[kamezawa.hiroyu@jp.fujitsu.com: fix OOM with memcg]
[kamezawa.hiroyu@jp.fujitsu.com: memcg: lru scan fix]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
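A quick way to see what that formula does in practice is the userspace sketch
below. It is not part of the patch; the naive int_sqrt_demo() merely stands in
for the kernel's int_sqrt(), and the sizes are illustrative. It prints the
target ratio and the share of anon pages kept inactive, matching the table
added to mm/page_alloc.c further down.

/*
 * Userspace sketch, not part of the patch: check how the
 * ratio = int_sqrt(10 * size_in_GB) rule from the new
 * setup_per_zone_inactive_ratio() scales for a few sizes.
 */
#include <stdio.h>

/* Naive stand-in for the kernel's int_sqrt(): floor(sqrt(x)). */
static unsigned long int_sqrt_demo(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        const unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
        size_t i;

        for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                unsigned long ratio = int_sqrt_demo(10 * sizes_gb[i]);

                if (!ratio)
                        ratio = 1;
                /* ratio N means N:1 active:inactive, i.e. 1/(N+1) inactive */
                printf("%6lu GB -> inactive_ratio %3lu (~%.1f%% of anon inactive)\n",
                       sizes_gb[i], ratio, 100.0 / (ratio + 1));
        }
        return 0;
}

A ratio of 3 thus keeps about 25% of the anon pages on the inactive list; by
1TB only about 1% (roughly 10GB) needs to be kept there.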
-rw-r--r--  include/linux/mm_inline.h  19
-rw-r--r--  include/linux/mmzone.h      6
-rw-r--r--  mm/page_alloc.c            41
-rw-r--r--  mm/vmscan.c                38
-rw-r--r--  mm/vmstat.c                 6
5 files changed, 104 insertions(+), 6 deletions(-)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2eb599465d56..f451fedd1e75 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -117,4 +117,23 @@ static inline enum lru_list page_lru(struct page *page)
         return lru;
 }
 
+/**
+ * inactive_anon_is_low - check if anonymous pages need to be deactivated
+ * @zone: zone to check
+ *
+ * Returns true if the zone does not have enough inactive anon pages,
+ * meaning some active anon pages need to be deactivated.
+ */
+static inline int inactive_anon_is_low(struct zone *zone)
+{
+        unsigned long active, inactive;
+
+        active = zone_page_state(zone, NR_ACTIVE_ANON);
+        inactive = zone_page_state(zone, NR_INACTIVE_ANON);
+
+        if (inactive * zone->inactive_ratio < active)
+                return 1;
+
+        return 0;
+}
 #endif
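For intuition, here is the same inactive * inactive_ratio < active test pulled
out of struct zone, with hypothetical page counts instead of the real
zone_page_state() counters (a sketch, not from the patch):

/*
 * Hypothetical numbers, not from the patch: the same comparison as in
 * inactive_anon_is_low() above, shown outside of struct zone.
 */
#include <stdio.h>

static int inactive_anon_is_low_demo(unsigned long active,
                                     unsigned long inactive,
                                     unsigned int inactive_ratio)
{
        return inactive * inactive_ratio < active;
}

int main(void)
{
        /* inactive_ratio = 3: deactivate once under 1/4 of anon is inactive */
        printf("%d\n", inactive_anon_is_low_demo(200000, 50000, 3)); /* 150000 < 200000 -> 1 */
        printf("%d\n", inactive_anon_is_low_demo(190000, 70000, 3)); /* 210000 >= 190000 -> 0 */
        return 0;
}

With inactive_ratio = 3 the check fires as soon as fewer than a quarter of the
anonymous pages are on the inactive list.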
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 59a4c8fd6ebd..9c5111f49a32 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -323,6 +323,12 @@ struct zone {
          */
         int prev_priority;
 
+        /*
+         * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+         * this zone's LRU.  Maintained by the pageout code.
+         */
+        unsigned int inactive_ratio;
+
 
         ZONE_PADDING(_pad2_)
         /* Rarely used or read-mostly fields */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 740a16a32c22..79c0981b1d32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4263,6 +4263,46 @@ void setup_per_zone_pages_min(void)
         calculate_totalreserve_pages();
 }
 
+/**
+ * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
+ *
+ * The inactive anon list should be small enough that the VM never has to
+ * do too much work, but large enough that each inactive page has a chance
+ * to be referenced again before it is swapped out.
+ *
+ * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
+ * INACTIVE_ANON pages on this zone's LRU, maintained by the
+ * pageout code.  A zone->inactive_ratio of 3 means 3:1 or 25% of
+ * the anonymous pages are kept on the inactive list.
+ *
+ * total     target    max
+ * memory    ratio     inactive anon
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
+ */
+void setup_per_zone_inactive_ratio(void)
+{
+        struct zone *zone;
+
+        for_each_zone(zone) {
+                unsigned int gb, ratio;
+
+                /* Zone size in gigabytes */
+                gb = zone->present_pages >> (30 - PAGE_SHIFT);
+                ratio = int_sqrt(10 * gb);
+                if (!ratio)
+                        ratio = 1;
+
+                zone->inactive_ratio = ratio;
+        }
+}
+
 /*
  * Initialise min_free_kbytes.
  *
@@ -4300,6 +4340,7 @@ static int __init init_per_zone_pages_min(void)
                 min_free_kbytes = 65536;
         setup_per_zone_pages_min();
         setup_per_zone_lowmem_reserve();
+        setup_per_zone_inactive_ratio();
         return 0;
 }
 module_init(init_per_zone_pages_min)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d10d2f9a33f3..c82ee9a33cfc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1090,6 +1090,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
         spin_unlock_irq(&zone->lru_lock);
 
+        pgmoved = 0;
         while (!list_empty(&l_hold)) {
                 cond_resched();
                 page = lru_to_page(&l_hold);
@@ -1098,6 +1099,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
         }
 
         /*
+         * Count the referenced pages as rotated, even when they are moved
+         * to the inactive list. This helps balance scan pressure between
+         * file and anonymous pages in get_scan_ratio.
+         */
+        zone->recent_rotated[!!file] += pgmoved;
+
+        /*
          * Now put the pages back on the appropriate [file or anon] inactive
          * and active lists.
          */
@@ -1158,7 +1166,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                 }
         }
         __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-        zone->recent_rotated[!!file] += pgmoved;
 
         __count_zone_vm_events(PGREFILL, zone, pgscanned);
         __count_vm_events(PGDEACTIVATE, pgdeactivate);
@@ -1174,7 +1181,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 {
         int file = is_file_lru(lru);
 
-        if (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE) {
+        if (lru == LRU_ACTIVE_FILE) {
+                shrink_active_list(nr_to_scan, zone, sc, priority, file);
+                return 0;
+        }
+
+        if (lru == LRU_ACTIVE_ANON &&
+            (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
                 shrink_active_list(nr_to_scan, zone, sc, priority, file);
                 return 0;
         }
@@ -1310,8 +1323,8 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
                 }
         }
 
-        while (nr[LRU_ACTIVE_ANON] || nr[LRU_INACTIVE_ANON] ||
-                                nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
+        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+                                nr[LRU_INACTIVE_FILE]) {
                 for_each_lru(l) {
                         if (nr[l]) {
                                 nr_to_scan = min(nr[l],
@@ -1324,6 +1337,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
                 }
         }
 
+        /*
+         * Even if we did not try to evict anon pages at all, we want to
+         * rebalance the anon lru active/inactive ratio.
+         */
+        if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
+                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+        else if (!scan_global_lru(sc))
+                shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
+
         throttle_vm_writeout(sc->gfp_mask);
         return nr_reclaimed;
 }
@@ -1617,6 +1639,14 @@ loop_again:
                             priority != DEF_PRIORITY)
                                 continue;
 
+                        /*
+                         * Do some background aging of the anon list, to give
+                         * pages a chance to be referenced before reclaiming.
+                         */
+                        if (inactive_anon_is_low(zone))
+                                shrink_active_list(SWAP_CLUSTER_MAX, zone,
+                                                        &sc, priority, 0);
+
                         if (!zone_watermark_ok(zone, order, zone->pages_high,
                                                0, 0)) {
                                 end_zone = i;
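Taken together, the shrink_list() and shrink_zone() hunks above amount to a
small policy: active file pages keep being aged through shrink_active_list()
based on reference bits as before, while active anon pages are only
deactivated for memcg reclaim or when inactive_anon_is_low() reports that the
inactive anon list has fallen below the target ratio. Below is a standalone
restatement of that policy; it is not kernel code, and the helper name and
truth table are ours.

/*
 * Standalone restatement (not kernel code) of when shrink_active_list()
 * is invoked after this patch.  The conditions mirror the two "if"
 * blocks added to shrink_list() above.
 */
#include <stdio.h>
#include <stdbool.h>

enum lru_list {
        LRU_INACTIVE_ANON,
        LRU_ACTIVE_ANON,
        LRU_INACTIVE_FILE,
        LRU_ACTIVE_FILE,
};

/* Does this list get aged via shrink_active_list()? */
static bool ages_active_list(enum lru_list lru, bool global_scan,
                             bool inactive_anon_low)
{
        if (lru == LRU_ACTIVE_FILE)
                return true;    /* file pages: keep checking reference bits */
        if (lru == LRU_ACTIVE_ANON)
                /* anon pages: deactivate only when the inactive list is
                 * short; memcg (non-global) reclaim always qualifies here */
                return !global_scan || inactive_anon_low;
        return false;           /* inactive lists are reclaimed, not aged */
}

int main(void)
{
        printf("active file, global scan             -> %d\n",
               ages_active_list(LRU_ACTIVE_FILE, true, false));
        printf("active anon, global, enough inactive -> %d\n",
               ages_active_list(LRU_ACTIVE_ANON, true, false));
        printf("active anon, global, inactive low    -> %d\n",
               ages_active_list(LRU_ACTIVE_ANON, true, true));
        printf("active anon, memcg scan              -> %d\n",
               ages_active_list(LRU_ACTIVE_ANON, false, false));
        return 0;
}

Everything else falls through to the remainder of shrink_list(), i.e.
inactive-list reclaim, as in the hunks above.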
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 27400b7da7c4..4380b0dba6d9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -738,10 +738,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
         seq_printf(m,
                    "\n  all_unreclaimable: %u"
                    "\n  prev_priority:     %i"
-                   "\n  start_pfn:         %lu",
+                   "\n  start_pfn:         %lu"
+                   "\n  inactive_ratio:    %u",
                    zone_is_all_unreclaimable(zone),
                    zone->prev_priority,
-                   zone->zone_start_pfn);
+                   zone->zone_start_pfn,
+                   zone->inactive_ratio);
         seq_putc(m, '\n');
 }
 
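With the vmstat.c hunk applied, the per-zone inactive_ratio shows up in
/proc/zoneinfo next to prev_priority and start_pfn. A small userspace sketch
(again not part of the patch) that prints just the zone headers and the new
field:

/*
 * Userspace sketch, not part of the patch: print each zone header from
 * /proc/zoneinfo together with its inactive_ratio line.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        FILE *f = fopen("/proc/zoneinfo", "r");
        char line[256];

        if (!f) {
                perror("/proc/zoneinfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "Node", 4) == 0 ||
                    strstr(line, "inactive_ratio"))
                        fputs(line, stdout);
        }
        fclose(f);
        return 0;
}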