author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2008-02-07 03:14:37 -0500
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2008-02-07 11:42:22 -0500
commit	1cfb419b394ba82745c54ff05436d598ecc2dbd5 (patch)
tree	33624176aff8f3a09f572c0fa3d699dbacdb447d
parent	cc38108e1ba7f3b9e12b82d0236fa3730c2e0439 (diff)
per-zone and reclaim enhancements for memory controller: modifies vmscan.c to isolate global/cgroup lru activity
When using the memory controller, there are 2 levels of memory reclaim:

 1. zone memory reclaim, because of system/zone memory shortage.
 2. memory cgroup memory reclaim, because a cgroup hit its limit.

These two can be distinguished by the sc->mem_cgroup parameter (the scan_global_lru() macro).

This patch tries to make the memory cgroup reclaim routine avoid affecting system/zone memory reclaim. It inserts if (scan_global_lru(sc)) checks and hooks into the memory cgroup reclaim support functions.

This patch helps isolate system lru activity from group lru activity and shows which additional functions are necessary:

 * mem_cgroup_calc_mapped_ratio() ... calculate the mapped ratio for a cgroup.
 * mem_cgroup_reclaim_imbalance() ... calculate the active/inactive balance in a cgroup.
 * mem_cgroup_calc_reclaim_active() ... calculate the number of active pages to be scanned at this priority in a mem_cgroup.
 * mem_cgroup_calc_reclaim_inactive() ... calculate the number of inactive pages to be scanned at this priority in a mem_cgroup.
 * mem_cgroup_all_unreclaimable() ... check whether all of a cgroup's pages are unreclaimable.
 * mem_cgroup_get_reclaim_priority() ... return the recorded reclaim priority.
 * mem_cgroup_note_reclaim_priority() ... record the reclaim priority (temporary).
 * mem_cgroup_remember_reclaim_priority() ... record the reclaim priority as zone->prev_priority does. This value is used to calculate reclaim_mapped.

[akpm@linux-foundation.org: fix unused var warning]

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Paul Menage <menage@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
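For context, the whole series keys off whether a memory cgroup is driving the reclaim. Below is a minimal sketch of that test, assuming only the sc->mem_cgroup convention described above; the real scan_global_lru() macro and struct scan_control are defined in mm/vmscan.c by an earlier patch in this series, and the struct name here is purely illustrative:

	struct mem_cgroup;

	/* hypothetical, trimmed-down view of the reclaim context */
	struct scan_control_sketch {
		/* NULL for system/zone reclaim, non-NULL when a cgroup hit its limit */
		struct mem_cgroup *mem_cgroup;
	};

	/* sketch: the global LRU is being scanned iff no cgroup was passed in */
	#define scan_global_lru(sc)	(!(sc)->mem_cgroup)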
-rw-r--r--	mm/vmscan.c	332
1 files changed, 201 insertions, 131 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index be4dfe87be03..a26dabd62fed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -856,7 +856,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
 		__mod_zone_page_state(zone, NR_INACTIVE,
 						-(nr_taken - nr_active));
-		zone->pages_scanned += nr_scan;
+		if (scan_global_lru(sc))
+			zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
 		nr_scanned += nr_scan;
@@ -888,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		if (current_is_kswapd()) {
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
 			__count_vm_events(KSWAPD_STEAL, nr_freed);
-		} else
+		} else if (scan_global_lru(sc))
 			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+
 		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
 
 		if (nr_taken == 0)
@@ -943,6 +945,113 @@ static inline int zone_is_near_oom(struct zone *zone)
 }
 
 /*
+ * Determine we should try to reclaim mapped pages.
+ * This is called only when sc->mem_cgroup is NULL.
+ */
+static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
+				int priority)
+{
+	long mapped_ratio;
+	long distress;
+	long swap_tendency;
+	long imbalance;
+	int reclaim_mapped = 0;
+	int prev_priority;
+
+	if (scan_global_lru(sc) && zone_is_near_oom(zone))
+		return 1;
+	/*
+	 * `distress' is a measure of how much trouble we're having
+	 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+	 */
+	if (scan_global_lru(sc))
+		prev_priority = zone->prev_priority;
+	else
+		prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
+
+	distress = 100 >> min(prev_priority, priority);
+
+	/*
+	 * The point of this algorithm is to decide when to start
+	 * reclaiming mapped memory instead of just pagecache.  Work out
+	 * how much memory
+	 * is mapped.
+	 */
+	if (scan_global_lru(sc))
+		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+				global_page_state(NR_ANON_PAGES)) * 100) /
+					vm_total_pages;
+	else
+		mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
+
+	/*
+	 * Now decide how much we really want to unmap some pages.  The
+	 * mapped ratio is downgraded - just because there's a lot of
+	 * mapped memory doesn't necessarily mean that page reclaim
+	 * isn't succeeding.
+	 *
+	 * The distress ratio is important - we don't want to start
+	 * going oom.
+	 *
+	 * A 100% value of vm_swappiness overrides this algorithm
+	 * altogether.
+	 */
+	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+
+	/*
+	 * If there's huge imbalance between active and inactive
+	 * (think active 100 times larger than inactive) we should
+	 * become more permissive, or the system will take too much
+	 * cpu before it start swapping during memory pressure.
+	 * Distress is about avoiding early-oom, this is about
+	 * making swappiness graceful despite setting it to low
+	 * values.
+	 *
+	 * Avoid div by zero with nr_inactive+1, and max resulting
+	 * value is vm_total_pages.
+	 */
+	if (scan_global_lru(sc)) {
+		imbalance = zone_page_state(zone, NR_ACTIVE);
+		imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
+	} else
+		imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
+
+	/*
+	 * Reduce the effect of imbalance if swappiness is low,
+	 * this means for a swappiness very low, the imbalance
+	 * must be much higher than 100 for this logic to make
+	 * the difference.
+	 *
+	 * Max temporary value is vm_total_pages*100.
+	 */
+	imbalance *= (vm_swappiness + 1);
+	imbalance /= 100;
+
+	/*
+	 * If not much of the ram is mapped, makes the imbalance
+	 * less relevant, it's high priority we refill the inactive
+	 * list with mapped pages only in presence of high ratio of
+	 * mapped pages.
+	 *
+	 * Max temporary value is vm_total_pages*100.
+	 */
+	imbalance *= mapped_ratio;
+	imbalance /= 100;
+
+	/* apply imbalance feedback to swap_tendency */
+	swap_tendency += imbalance;
+
+	/*
+	 * Now use this metric to decide whether to start moving mapped
+	 * memory onto the inactive list.
+	 */
+	if (swap_tendency >= 100)
+		reclaim_mapped = 1;
+
+	return reclaim_mapped;
+}
+
+/*
  * This moves pages from the active list to the inactive list.
  *
  * We move them the other way if the page is referenced by one or more
@@ -959,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone)
  * The downside is that we have to touch page->_count against each page.
  * But we had to alter page->flags anyway.
  */
+
+
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 				struct scan_control *sc, int priority)
 {
@@ -972,100 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
 
-	if (sc->may_swap) {
-		long mapped_ratio;
-		long distress;
-		long swap_tendency;
-		long imbalance;
-
-		if (zone_is_near_oom(zone))
-			goto force_reclaim_mapped;
-
-		/*
-		 * `distress' is a measure of how much trouble we're having
-		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
-		 */
-		distress = 100 >> min(zone->prev_priority, priority);
-
-		/*
-		 * The point of this algorithm is to decide when to start
-		 * reclaiming mapped memory instead of just pagecache.  Work out
-		 * how much memory
-		 * is mapped.
-		 */
-		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-				global_page_state(NR_ANON_PAGES)) * 100) /
-					vm_total_pages;
-
-		/*
-		 * Now decide how much we really want to unmap some pages.  The
-		 * mapped ratio is downgraded - just because there's a lot of
-		 * mapped memory doesn't necessarily mean that page reclaim
-		 * isn't succeeding.
-		 *
-		 * The distress ratio is important - we don't want to start
-		 * going oom.
-		 *
-		 * A 100% value of vm_swappiness overrides this algorithm
-		 * altogether.
-		 */
-		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
-		/*
-		 * If there's huge imbalance between active and inactive
-		 * (think active 100 times larger than inactive) we should
-		 * become more permissive, or the system will take too much
-		 * cpu before it start swapping during memory pressure.
-		 * Distress is about avoiding early-oom, this is about
-		 * making swappiness graceful despite setting it to low
-		 * values.
-		 *
-		 * Avoid div by zero with nr_inactive+1, and max resulting
-		 * value is vm_total_pages.
-		 */
-		imbalance = zone_page_state(zone, NR_ACTIVE);
-		imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
-
-		/*
-		 * Reduce the effect of imbalance if swappiness is low,
-		 * this means for a swappiness very low, the imbalance
-		 * must be much higher than 100 for this logic to make
-		 * the difference.
-		 *
-		 * Max temporary value is vm_total_pages*100.
-		 */
-		imbalance *= (vm_swappiness + 1);
-		imbalance /= 100;
-
-		/*
-		 * If not much of the ram is mapped, makes the imbalance
-		 * less relevant, it's high priority we refill the inactive
-		 * list with mapped pages only in presence of high ratio of
-		 * mapped pages.
-		 *
-		 * Max temporary value is vm_total_pages*100.
-		 */
-		imbalance *= mapped_ratio;
-		imbalance /= 100;
-
-		/* apply imbalance feedback to swap_tendency */
-		swap_tendency += imbalance;
-
-		/*
-		 * Now use this metric to decide whether to start moving mapped
-		 * memory onto the inactive list.
-		 */
-		if (swap_tendency >= 100)
-force_reclaim_mapped:
-			reclaim_mapped = 1;
-	}
+	if (sc->may_swap)
+		reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
 	pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
 					ISOLATE_ACTIVE, zone,
 					sc->mem_cgroup, 1);
-	zone->pages_scanned += pgscanned;
+	/*
+	 * zone->pages_scanned is used for detect zone's oom
+	 * mem_cgroup remembers nr_scan by itself.
+	 */
+	if (scan_global_lru(sc))
+		zone->pages_scanned += pgscanned;
+
 	__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
@@ -1155,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 	unsigned long nr_to_scan;
 	unsigned long nr_reclaimed = 0;
 
-	/*
-	 * Add one to `nr_to_scan' just to make sure that the kernel will
-	 * slowly sift through the active list.
-	 */
-	zone->nr_scan_active +=
-		(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
-	nr_active = zone->nr_scan_active;
-	if (nr_active >= sc->swap_cluster_max)
-		zone->nr_scan_active = 0;
-	else
-		nr_active = 0;
+	if (scan_global_lru(sc)) {
+		/*
+		 * Add one to nr_to_scan just to make sure that the kernel
+		 * will slowly sift through the active list.
+		 */
+		zone->nr_scan_active +=
+			(zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
+		nr_active = zone->nr_scan_active;
+		zone->nr_scan_inactive +=
+			(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
+		nr_inactive = zone->nr_scan_inactive;
+		if (nr_inactive >= sc->swap_cluster_max)
+			zone->nr_scan_inactive = 0;
+		else
+			nr_inactive = 0;
+
+		if (nr_active >= sc->swap_cluster_max)
+			zone->nr_scan_active = 0;
+		else
+			nr_active = 0;
+	} else {
+		/*
+		 * This reclaim occurs not because zone memory shortage but
+		 * because memory controller hits its limit.
+		 * Then, don't modify zone reclaim related data.
+		 */
+		nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
+					zone, priority);
+
+		nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
+					zone, priority);
+	}
 
-	zone->nr_scan_inactive +=
-		(zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
-	nr_inactive = zone->nr_scan_inactive;
-	if (nr_inactive >= sc->swap_cluster_max)
-		zone->nr_scan_inactive = 0;
-	else
-		nr_inactive = 0;
 
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
@@ -1218,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 	unsigned long nr_reclaimed = 0;
 	int i;
 
+
 	sc->all_unreclaimable = 1;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
 		if (!populated_zone(zone))
 			continue;
+		/*
+		 * Take care memory controller reclaiming has small influence
+		 * to global LRU.
+		 */
+		if (scan_global_lru(sc)) {
+			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+				continue;
+			note_zone_scanning_priority(zone, priority);
 
-		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-			continue;
-
-		note_zone_scanning_priority(zone, priority);
-
-		if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
-			continue;	/* Let kswapd poll it */
-
-		sc->all_unreclaimable = 0;
+			if (zone_is_all_unreclaimable(zone) &&
+				priority != DEF_PRIORITY)
+				continue;	/* Let kswapd poll it */
+			sc->all_unreclaimable = 0;
+		} else {
+			/*
+			 * Ignore cpuset limitation here. We just want to reduce
+			 * # of used pages by us regardless of memory shortage.
+			 */
+			sc->all_unreclaimable = 0;
+			mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
+							priority);
+		}
 
 		nr_reclaimed += shrink_zone(priority, zone, sc);
 	}
+
 	return nr_reclaimed;
 }
 
@@ -1264,16 +1324,21 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
 	unsigned long lru_pages = 0;
 	int i;
 
-	count_vm_event(ALLOCSTALL);
-
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *zone = zones[i];
+	if (scan_global_lru(sc))
+		count_vm_event(ALLOCSTALL);
+	/*
+	 * mem_cgroup will not do shrink_slab.
+	 */
+	if (scan_global_lru(sc)) {
+		for (i = 0; zones[i] != NULL; i++) {
+			struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-			continue;
+			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+				continue;
 
-		lru_pages += zone_page_state(zone, NR_ACTIVE)
-				+ zone_page_state(zone, NR_INACTIVE);
+			lru_pages += zone_page_state(zone, NR_ACTIVE)
+					+ zone_page_state(zone, NR_INACTIVE);
+		}
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
@@ -1330,14 +1395,19 @@ out:
 	 */
 	if (priority < 0)
 		priority = 0;
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-			continue;
+	if (scan_global_lru(sc)) {
+		for (i = 0; zones[i] != NULL; i++) {
+			struct zone *zone = zones[i];
+
+			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+				continue;
+
+			zone->prev_priority = priority;
+		}
+	} else
+		mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
 
-		zone->prev_priority = priority;
-	}
 	return ret;
 }
 
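For a rough feel of the swap_tendency heuristic that calc_reclaim_mapped() computes for both the global and the cgroup case, the following stand-alone userspace sketch reruns the same arithmetic with made-up inputs; in the kernel the values come from zone or mem_cgroup statistics, and nothing below is part of the patch itself:

	#include <stdio.h>

	int main(void)
	{
		/* made-up inputs; the kernel reads them from zone/cgroup statistics */
		long prev_priority = 10, priority = 10;	/* little recent reclaim trouble */
		long mapped_ratio  = 60;		/* 60% of RAM is mapped          */
		long swappiness    = 60;		/* default vm_swappiness         */
		long nr_active = 100000, nr_inactive = 1000;

		/* distress: 0 -> no problems, 100 -> great trouble */
		long distress = 100 >> (prev_priority < priority ? prev_priority
								 : priority);
		long swap_tendency = mapped_ratio / 2 + distress + swappiness;

		/* imbalance feedback, scaled by swappiness and mapped_ratio,
		 * mirroring the two *= / /= 100 steps in calc_reclaim_mapped() */
		long imbalance = nr_active / (nr_inactive + 1);
		imbalance = imbalance * (swappiness + 1) / 100;
		imbalance = imbalance * mapped_ratio / 100;
		swap_tendency += imbalance;

		printf("distress=%ld imbalance=%ld swap_tendency=%ld reclaim_mapped=%d\n",
		       distress, imbalance, swap_tendency, swap_tendency >= 100);
		return 0;
	}

With these numbers distress is 0 and mapped_ratio/2 + swappiness only reaches 90, but the 100:1 active/inactive imbalance feeds back another 36 points, so swap_tendency ends up at 126 and mapped pages become eligible for reclaim.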