diff options
-rw-r--r-- | mm/vmscan.c | 332 |
1 files changed, 201 insertions, 131 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index be4dfe87be03..a26dabd62fed 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -856,7 +856,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
856 | __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); | 856 | __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); |
857 | __mod_zone_page_state(zone, NR_INACTIVE, | 857 | __mod_zone_page_state(zone, NR_INACTIVE, |
858 | -(nr_taken - nr_active)); | 858 | -(nr_taken - nr_active)); |
859 | zone->pages_scanned += nr_scan; | 859 | if (scan_global_lru(sc)) |
860 | zone->pages_scanned += nr_scan; | ||
860 | spin_unlock_irq(&zone->lru_lock); | 861 | spin_unlock_irq(&zone->lru_lock); |
861 | 862 | ||
862 | nr_scanned += nr_scan; | 863 | nr_scanned += nr_scan; |
@@ -888,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
888 | if (current_is_kswapd()) { | 889 | if (current_is_kswapd()) { |
889 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); | 890 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); |
890 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 891 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
891 | } else | 892 | } else if (scan_global_lru(sc)) |
892 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | 893 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
894 | |||
893 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); | 895 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); |
894 | 896 | ||
895 | if (nr_taken == 0) | 897 | if (nr_taken == 0) |
@@ -943,6 +945,113 @@ static inline int zone_is_near_oom(struct zone *zone) | |||
943 | } | 945 | } |
944 | 946 | ||
945 | /* | 947 | /* |
948 | * Determine whether we should try to reclaim mapped pages. | ||
949 | * This is called only when sc->mem_cgroup is NULL. | ||
950 | */ | ||
951 | static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, | ||
952 | int priority) | ||
953 | { | ||
954 | long mapped_ratio; | ||
955 | long distress; | ||
956 | long swap_tendency; | ||
957 | long imbalance; | ||
958 | int reclaim_mapped = 0; | ||
959 | int prev_priority; | ||
960 | |||
961 | if (scan_global_lru(sc) && zone_is_near_oom(zone)) | ||
962 | return 1; | ||
963 | /* | ||
964 | * `distress' is a measure of how much trouble we're having | ||
965 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | ||
966 | */ | ||
967 | if (scan_global_lru(sc)) | ||
968 | prev_priority = zone->prev_priority; | ||
969 | else | ||
970 | prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); | ||
971 | |||
972 | distress = 100 >> min(prev_priority, priority); | ||
973 | |||
974 | /* | ||
975 | * The point of this algorithm is to decide when to start | ||
976 | * reclaiming mapped memory instead of just pagecache. Work out | ||
977 | * how much memory | ||
978 | * is mapped. | ||
979 | */ | ||
980 | if (scan_global_lru(sc)) | ||
981 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + | ||
982 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
983 | vm_total_pages; | ||
984 | else | ||
985 | mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); | ||
986 | |||
987 | /* | ||
988 | * Now decide how much we really want to unmap some pages. The | ||
989 | * mapped ratio is downgraded - just because there's a lot of | ||
990 | * mapped memory doesn't necessarily mean that page reclaim | ||
991 | * isn't succeeding. | ||
992 | * | ||
993 | * The distress ratio is important - we don't want to start | ||
994 | * going oom. | ||
995 | * | ||
996 | * A 100% value of vm_swappiness overrides this algorithm | ||
997 | * altogether. | ||
998 | */ | ||
999 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | ||
1000 | |||
1001 | /* | ||
1002 | * If there's huge imbalance between active and inactive | ||
1003 | * (think active 100 times larger than inactive) we should | ||
1004 | * become more permissive, or the system will take too much | ||
1005 | * cpu before it start swapping during memory pressure. | ||
1006 | * Distress is about avoiding early-oom, this is about | ||
1007 | * making swappiness graceful despite setting it to low | ||
1008 | * values. | ||
1009 | * | ||
1010 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
1011 | * value is vm_total_pages. | ||
1012 | */ | ||
1013 | if (scan_global_lru(sc)) { | ||
1014 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
1015 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
1016 | } else | ||
1017 | imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); | ||
1018 | |||
1019 | /* | ||
1020 | * Reduce the effect of imbalance if swappiness is low, | ||
1021 | * this means for a swappiness very low, the imbalance | ||
1022 | * must be much higher than 100 for this logic to make | ||
1023 | * the difference. | ||
1024 | * | ||
1025 | * Max temporary value is vm_total_pages*100. | ||
1026 | */ | ||
1027 | imbalance *= (vm_swappiness + 1); | ||
1028 | imbalance /= 100; | ||
1029 | |||
1030 | /* | ||
1031 | * If not much of the ram is mapped, makes the imbalance | ||
1032 | * less relevant, it's high priority we refill the inactive | ||
1033 | * list with mapped pages only in presence of high ratio of | ||
1034 | * mapped pages. | ||
1035 | * | ||
1036 | * Max temporary value is vm_total_pages*100. | ||
1037 | */ | ||
1038 | imbalance *= mapped_ratio; | ||
1039 | imbalance /= 100; | ||
1040 | |||
1041 | /* apply imbalance feedback to swap_tendency */ | ||
1042 | swap_tendency += imbalance; | ||
1043 | |||
1044 | /* | ||
1045 | * Now use this metric to decide whether to start moving mapped | ||
1046 | * memory onto the inactive list. | ||
1047 | */ | ||
1048 | if (swap_tendency >= 100) | ||
1049 | reclaim_mapped = 1; | ||
1050 | |||
1051 | return reclaim_mapped; | ||
1052 | } | ||
1053 | |||
1054 | /* | ||
946 | * This moves pages from the active list to the inactive list. | 1055 | * This moves pages from the active list to the inactive list. |
947 | * | 1056 | * |
948 | * We move them the other way if the page is referenced by one or more | 1057 | * We move them the other way if the page is referenced by one or more |
@@ -959,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone) | |||
959 | * The downside is that we have to touch page->_count against each page. | 1068 | * The downside is that we have to touch page->_count against each page. |
960 | * But we had to alter page->flags anyway. | 1069 | * But we had to alter page->flags anyway. |
961 | */ | 1070 | */ |
1071 | |||
1072 | |||
962 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1073 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
963 | struct scan_control *sc, int priority) | 1074 | struct scan_control *sc, int priority) |
964 | { | 1075 | { |
@@ -972,100 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
972 | struct pagevec pvec; | 1083 | struct pagevec pvec; |
973 | int reclaim_mapped = 0; | 1084 | int reclaim_mapped = 0; |
974 | 1085 | ||
975 | if (sc->may_swap) { | 1086 | if (sc->may_swap) |
976 | long mapped_ratio; | 1087 | reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); |
977 | long distress; | ||
978 | long swap_tendency; | ||
979 | long imbalance; | ||
980 | |||
981 | if (zone_is_near_oom(zone)) | ||
982 | goto force_reclaim_mapped; | ||
983 | |||
984 | /* | ||
985 | * `distress' is a measure of how much trouble we're having | ||
986 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | ||
987 | */ | ||
988 | distress = 100 >> min(zone->prev_priority, priority); | ||
989 | |||
990 | /* | ||
991 | * The point of this algorithm is to decide when to start | ||
992 | * reclaiming mapped memory instead of just pagecache. Work out | ||
993 | * how much memory | ||
994 | * is mapped. | ||
995 | */ | ||
996 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + | ||
997 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
998 | vm_total_pages; | ||
999 | |||
1000 | /* | ||
1001 | * Now decide how much we really want to unmap some pages. The | ||
1002 | * mapped ratio is downgraded - just because there's a lot of | ||
1003 | * mapped memory doesn't necessarily mean that page reclaim | ||
1004 | * isn't succeeding. | ||
1005 | * | ||
1006 | * The distress ratio is important - we don't want to start | ||
1007 | * going oom. | ||
1008 | * | ||
1009 | * A 100% value of vm_swappiness overrides this algorithm | ||
1010 | * altogether. | ||
1011 | */ | ||
1012 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | ||
1013 | |||
1014 | /* | ||
1015 | * If there's huge imbalance between active and inactive | ||
1016 | * (think active 100 times larger than inactive) we should | ||
1017 | * become more permissive, or the system will take too much | ||
1018 | * cpu before it start swapping during memory pressure. | ||
1019 | * Distress is about avoiding early-oom, this is about | ||
1020 | * making swappiness graceful despite setting it to low | ||
1021 | * values. | ||
1022 | * | ||
1023 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
1024 | * value is vm_total_pages. | ||
1025 | */ | ||
1026 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
1027 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
1028 | |||
1029 | /* | ||
1030 | * Reduce the effect of imbalance if swappiness is low, | ||
1031 | * this means for a swappiness very low, the imbalance | ||
1032 | * must be much higher than 100 for this logic to make | ||
1033 | * the difference. | ||
1034 | * | ||
1035 | * Max temporary value is vm_total_pages*100. | ||
1036 | */ | ||
1037 | imbalance *= (vm_swappiness + 1); | ||
1038 | imbalance /= 100; | ||
1039 | |||
1040 | /* | ||
1041 | * If not much of the ram is mapped, makes the imbalance | ||
1042 | * less relevant, it's high priority we refill the inactive | ||
1043 | * list with mapped pages only in presence of high ratio of | ||
1044 | * mapped pages. | ||
1045 | * | ||
1046 | * Max temporary value is vm_total_pages*100. | ||
1047 | */ | ||
1048 | imbalance *= mapped_ratio; | ||
1049 | imbalance /= 100; | ||
1050 | |||
1051 | /* apply imbalance feedback to swap_tendency */ | ||
1052 | swap_tendency += imbalance; | ||
1053 | |||
1054 | /* | ||
1055 | * Now use this metric to decide whether to start moving mapped | ||
1056 | * memory onto the inactive list. | ||
1057 | */ | ||
1058 | if (swap_tendency >= 100) | ||
1059 | force_reclaim_mapped: | ||
1060 | reclaim_mapped = 1; | ||
1061 | } | ||
1062 | 1088 | ||
1063 | lru_add_drain(); | 1089 | lru_add_drain(); |
1064 | spin_lock_irq(&zone->lru_lock); | 1090 | spin_lock_irq(&zone->lru_lock); |
1065 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, | 1091 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, |
1066 | ISOLATE_ACTIVE, zone, | 1092 | ISOLATE_ACTIVE, zone, |
1067 | sc->mem_cgroup, 1); | 1093 | sc->mem_cgroup, 1); |
1068 | zone->pages_scanned += pgscanned; | 1094 | /* |
1095 | * zone->pages_scanned is used to detect zone's oom | ||
1096 | * mem_cgroup remembers nr_scan by itself. | ||
1097 | */ | ||
1098 | if (scan_global_lru(sc)) | ||
1099 | zone->pages_scanned += pgscanned; | ||
1100 | |||
1069 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); | 1101 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); |
1070 | spin_unlock_irq(&zone->lru_lock); | 1102 | spin_unlock_irq(&zone->lru_lock); |
1071 | 1103 | ||
@@ -1155,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone, | |||
1155 | unsigned long nr_to_scan; | 1187 | unsigned long nr_to_scan; |
1156 | unsigned long nr_reclaimed = 0; | 1188 | unsigned long nr_reclaimed = 0; |
1157 | 1189 | ||
1158 | /* | 1190 | if (scan_global_lru(sc)) { |
1159 | * Add one to `nr_to_scan' just to make sure that the kernel will | 1191 | /* |
1160 | * slowly sift through the active list. | 1192 | * Add one to nr_to_scan just to make sure that the kernel |
1161 | */ | 1193 | * will slowly sift through the active list. |
1162 | zone->nr_scan_active += | 1194 | */ |
1163 | (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; | 1195 | zone->nr_scan_active += |
1164 | nr_active = zone->nr_scan_active; | 1196 | (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; |
1165 | if (nr_active >= sc->swap_cluster_max) | 1197 | nr_active = zone->nr_scan_active; |
1166 | zone->nr_scan_active = 0; | 1198 | zone->nr_scan_inactive += |
1167 | else | 1199 | (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; |
1168 | nr_active = 0; | 1200 | nr_inactive = zone->nr_scan_inactive; |
1201 | if (nr_inactive >= sc->swap_cluster_max) | ||
1202 | zone->nr_scan_inactive = 0; | ||
1203 | else | ||
1204 | nr_inactive = 0; | ||
1205 | |||
1206 | if (nr_active >= sc->swap_cluster_max) | ||
1207 | zone->nr_scan_active = 0; | ||
1208 | else | ||
1209 | nr_active = 0; | ||
1210 | } else { | ||
1211 | /* | ||
1212 | * This reclaim occurs not because of a zone memory shortage but | ||
1213 | * because the memory controller hits its limit. | ||
1214 | * Then, don't modify zone reclaim related data. | ||
1215 | */ | ||
1216 | nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, | ||
1217 | zone, priority); | ||
1218 | |||
1219 | nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, | ||
1220 | zone, priority); | ||
1221 | } | ||
1169 | 1222 | ||
1170 | zone->nr_scan_inactive += | ||
1171 | (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; | ||
1172 | nr_inactive = zone->nr_scan_inactive; | ||
1173 | if (nr_inactive >= sc->swap_cluster_max) | ||
1174 | zone->nr_scan_inactive = 0; | ||
1175 | else | ||
1176 | nr_inactive = 0; | ||
1177 | 1223 | ||
1178 | while (nr_active || nr_inactive) { | 1224 | while (nr_active || nr_inactive) { |
1179 | if (nr_active) { | 1225 | if (nr_active) { |
@@ -1218,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones, | |||
1218 | unsigned long nr_reclaimed = 0; | 1264 | unsigned long nr_reclaimed = 0; |
1219 | int i; | 1265 | int i; |
1220 | 1266 | ||
1267 | |||
1221 | sc->all_unreclaimable = 1; | 1268 | sc->all_unreclaimable = 1; |
1222 | for (i = 0; zones[i] != NULL; i++) { | 1269 | for (i = 0; zones[i] != NULL; i++) { |
1223 | struct zone *zone = zones[i]; | 1270 | struct zone *zone = zones[i]; |
1224 | 1271 | ||
1225 | if (!populated_zone(zone)) | 1272 | if (!populated_zone(zone)) |
1226 | continue; | 1273 | continue; |
1274 | /* | ||
1275 | * Take care that memory controller reclaiming has only a small | ||
1276 | * influence on the global LRU. | ||
1277 | */ | ||
1278 | if (scan_global_lru(sc)) { | ||
1279 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
1280 | continue; | ||
1281 | note_zone_scanning_priority(zone, priority); | ||
1227 | 1282 | ||
1228 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1283 | if (zone_is_all_unreclaimable(zone) && |
1229 | continue; | 1284 | priority != DEF_PRIORITY) |
1230 | 1285 | continue; /* Let kswapd poll it */ | |
1231 | note_zone_scanning_priority(zone, priority); | 1286 | sc->all_unreclaimable = 0; |
1232 | 1287 | } else { | |
1233 | if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) | 1288 | /* |
1234 | continue; /* Let kswapd poll it */ | 1289 | * Ignore cpuset limitation here. We just want to reduce |
1235 | 1290 | * # of used pages by us regardless of memory shortage. | |
1236 | sc->all_unreclaimable = 0; | 1291 | */ |
1292 | sc->all_unreclaimable = 0; | ||
1293 | mem_cgroup_note_reclaim_priority(sc->mem_cgroup, | ||
1294 | priority); | ||
1295 | } | ||
1237 | 1296 | ||
1238 | nr_reclaimed += shrink_zone(priority, zone, sc); | 1297 | nr_reclaimed += shrink_zone(priority, zone, sc); |
1239 | } | 1298 | } |
1299 | |||
1240 | return nr_reclaimed; | 1300 | return nr_reclaimed; |
1241 | } | 1301 | } |
1242 | 1302 | ||
@@ -1264,16 +1324,21 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, | |||
1264 | unsigned long lru_pages = 0; | 1324 | unsigned long lru_pages = 0; |
1265 | int i; | 1325 | int i; |
1266 | 1326 | ||
1267 | count_vm_event(ALLOCSTALL); | 1327 | if (scan_global_lru(sc)) |
1268 | 1328 | count_vm_event(ALLOCSTALL); | |
1269 | for (i = 0; zones[i] != NULL; i++) { | 1329 | /* |
1270 | struct zone *zone = zones[i]; | 1330 | * mem_cgroup will not do shrink_slab. |
1331 | */ | ||
1332 | if (scan_global_lru(sc)) { | ||
1333 | for (i = 0; zones[i] != NULL; i++) { | ||
1334 | struct zone *zone = zones[i]; | ||
1271 | 1335 | ||
1272 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1336 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
1273 | continue; | 1337 | continue; |
1274 | 1338 | ||
1275 | lru_pages += zone_page_state(zone, NR_ACTIVE) | 1339 | lru_pages += zone_page_state(zone, NR_ACTIVE) |
1276 | + zone_page_state(zone, NR_INACTIVE); | 1340 | + zone_page_state(zone, NR_INACTIVE); |
1341 | } | ||
1277 | } | 1342 | } |
1278 | 1343 | ||
1279 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 1344 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
@@ -1330,14 +1395,19 @@ out: | |||
1330 | */ | 1395 | */ |
1331 | if (priority < 0) | 1396 | if (priority < 0) |
1332 | priority = 0; | 1397 | priority = 0; |
1333 | for (i = 0; zones[i] != NULL; i++) { | ||
1334 | struct zone *zone = zones[i]; | ||
1335 | 1398 | ||
1336 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1399 | if (scan_global_lru(sc)) { |
1337 | continue; | 1400 | for (i = 0; zones[i] != NULL; i++) { |
1401 | struct zone *zone = zones[i]; | ||
1402 | |||
1403 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
1404 | continue; | ||
1405 | |||
1406 | zone->prev_priority = priority; | ||
1407 | } | ||
1408 | } else | ||
1409 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); | ||
1338 | 1410 | ||
1339 | zone->prev_priority = priority; | ||
1340 | } | ||
1341 | return ret; | 1411 | return ret; |
1342 | } | 1412 | } |
1343 | 1413 | ||