Diffstat (limited to 'mm/memcontrol.c')

 mm/memcontrol.c | 226 ++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 212 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90f0b13e1c3c..011aba6cad70 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -139,6 +139,8 @@ struct mem_cgroup_per_zone {
 	unsigned long long usage_in_excess;	/* Set to the value by which */
 						/* the soft limit is exceeded*/
 	bool on_tree;
+	struct mem_cgroup *mem;			/* Back pointer, we cannot */
+						/* use container_of */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -228,6 +230,13 @@ struct mem_cgroup {
 	struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -259,6 +268,8 @@ enum charge_type {
 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
+#define MEM_CGROUP_RECLAIM_SOFT	(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
@@ -299,7 +310,7 @@ soft_limit_tree_from_page(struct page *page)
 }
 
 static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
@@ -311,7 +322,6 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 		return;
 
 	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-	spin_lock(&mctz->lock);
 	while (*p) {
 		parent = *p;
 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -328,6 +338,26 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 	rb_link_node(&mz->tree_node, parent, p);
 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_insert_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -337,8 +367,7 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 			struct mem_cgroup_tree_per_zone *mctz)
 {
 	spin_lock(&mctz->lock);
-	rb_erase(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = false;
+	__mem_cgroup_remove_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -408,6 +437,47 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
 	}
 }
 
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz = NULL;
+
+retry:
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back;
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+		!css_tryget(&mz->mem->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					struct page_cgroup *pc,
 					bool charge)
@@ -1037,6 +1107,7 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, this returns immediately to avoid freeing too much.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+						struct zone *zone,
 						gfp_t gfp_mask,
 						unsigned long reclaim_options)
 {
@@ -1045,23 +1116,53 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	int loop = 0;
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+	unsigned long excess = mem_cgroup_get_excess(root_mem);
 
 	/* If memsw_is_minimum==1, swap-out is of no use. */
 	if (root_mem->memsw_is_minimum)
 		noswap = true;
 
-	while (loop < 2) {
+	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
-		if (victim == root_mem)
+		if (victim == root_mem) {
 			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy.
+				 */
+				if (!check_soft || !total) {
+					css_put(&victim->css);
+					break;
+				}
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not too excessive, so we
+				 * don't reclaim too much, nor so little
+				 * that we keep coming back to this cgroup.
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+					css_put(&victim->css);
+					break;
+				}
+			}
+		}
 		if (!mem_cgroup_local_usage(&victim->stat)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
 		}
 		/* we use swappiness of local cgroup */
-		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-						get_swappiness(victim));
+		if (check_soft)
+			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+				noswap, get_swappiness(victim), zone,
+				zone->zone_pgdat->node_id);
+		else
+			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+						noswap, get_swappiness(victim));
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -1071,7 +1172,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		if (shrink)
 			return ret;
 		total += ret;
-		if (mem_cgroup_check_under_limit(root_mem))
+		if (check_soft) {
+			if (res_counter_check_under_soft_limit(&root_mem->res))
+				return total;
+		} else if (mem_cgroup_check_under_limit(root_mem))
 			return 1 + total;
 	}
 	return total;
@@ -1206,8 +1310,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-							flags);
+		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+						gfp_mask, flags);
 		if (ret)
 			continue;
 
@@ -2018,8 +2122,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-						MEM_CGROUP_RECLAIM_SHRINK);
+		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+						GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -2071,7 +2176,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
 						MEM_CGROUP_RECLAIM_NOSWAP |
 						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -2084,6 +2189,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(nid, zid);
+	/*
+	 * This loop can run for a while, especially if mem_cgroups
+	 * continuously keep exceeding their soft limit and putting
+	 * the system under pressure.
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+						gfp_mask,
+						MEM_CGROUP_RECLAIM_SOFT);
+		nr_reclaimed += reclaimed;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup,
+		 * it is time to move on to the next cgroup.
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have added the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree.
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz) {
+					css_put(&next_mz->mem->css);
+					next_mz = NULL;
+				} else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		mz->usage_in_excess =
+			res_counter_soft_limit_excess(&mz->mem->res);
+		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0 simply because, due
+		 * to priority, we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		if (mz->usage_in_excess)
+			__mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->mem->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try, or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->mem->css);
+	return nr_reclaimed;
+}
+
 /*
  * This routine traverses page_cgroup in the given list and drops them all.
  * *And* this routine doesn't reclaim the page itself, just removes page_cgroup.
@@ -2686,6 +2882,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
 		mz->usage_in_excess = 0;
+		mz->on_tree = false;
+		mz->mem = mem;
 	}
 	return 0;
 }
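
The data-structure side of this patch keeps every memcg that exceeds its soft limit on a per-node-per-zone rbtree ordered by usage_in_excess, so rb_last() always yields the worst offender; __mem_cgroup_largest_soft_limit_node() pops that node, and mem_cgroup_soft_limit_reclaim() reinserts it with a recomputed excess if it is still over its limit. Below is a minimal userspace sketch of that pick/remove/reinsert cycle; all names are hypothetical, and a plain array stands in for the kernel's rbtree + rb_last().

#include <stdio.h>

#define NGROUPS 4

struct group {
	const char *name;
	long excess;	/* pages over the soft limit; 0 means off-tree */
};

/* Stand-in for rb_last() on the excess-ordered tree: return the group
 * with the largest positive excess, or NULL if none is over its limit. */
static struct group *largest_excess(struct group *g, int n)
{
	struct group *best = NULL;

	for (int i = 0; i < n; i++)
		if (g[i].excess > 0 && (!best || g[i].excess > best->excess))
			best = &g[i];
	return best;
}

int main(void)
{
	struct group groups[NGROUPS] = {
		{ "A", 120 }, { "B", 40 }, { "C", 300 }, { "D", 0 },
	};
	struct group *victim;

	/* Reclaim pressure always lands on the worst offender first. */
	while ((victim = largest_excess(groups, NGROUPS)) != NULL) {
		long reclaimed = victim->excess / 2;	/* pretend reclaim */

		printf("reclaim %ld pages from %s\n", reclaimed, victim->name);
		victim->excess -= reclaimed;	/* reinsert with new excess */
		if (victim->excess < 2)		/* back under: off the tree */
			victim->excess = 0;
	}
	return 0;
}

Removing the victim from the tree before reclaiming, rather than after, is what lets concurrent inserters put the group back with a fresh excess value; it is also why the kernel version rechecks the excess and css_tryget() under mctz->lock before committing to a victim.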
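
The new exit conditions in mem_cgroup_hierarchical_reclaim() bound soft-limit reclaim in two ways: a work target (stop once total reclaim reaches excess >> 2, a quarter of the pages over the soft limit) and the MEM_CGROUP_MAX_RECLAIM_LOOPS cap of 100 rounds. The standalone C sketch below shows only those exit conditions; reclaim_pass() is a made-up stand-in for try_to_free_mem_cgroup_pages(), and loop here counts every pass, whereas the kernel only counts full trips around the hierarchy.

#include <stdio.h>

#define MAX_RECLAIM_LOOPS	100

/* Stand-in for a reclaim pass: pretend each trip over the hierarchy
 * frees 0..7 pages (tiny deterministic LCG, no real reclaim). */
static unsigned long reclaim_pass(void)
{
	static unsigned long x = 1;

	x = x * 1103515245UL + 12345UL;
	return (x >> 16) % 8;
}

/*
 * Mirrors the patch's soft-limit exit conditions: stop once total
 * reclaim reaches a quarter of the excess (excess >> 2), or after the
 * MAX_RECLAIM_LOOPS safety cap, whichever comes first.
 */
static unsigned long soft_limit_reclaim(unsigned long excess)
{
	unsigned long total = 0;
	int loop = 0;

	while (1) {
		loop++;
		if (loop >= 2 &&
		    (total >= (excess >> 2) || loop > MAX_RECLAIM_LOOPS))
			break;
		total += reclaim_pass();
	}
	return total;
}

int main(void)
{
	unsigned long excess = 400;	/* pages over the soft limit */

	printf("reclaimed %lu of %lu excess pages\n",
	       soft_limit_reclaim(excess), excess);
	return 0;
}

The quarter-of-excess target is the compromise the in-line comment describes: reclaiming the full excess in one call would stall the allocator that triggered soft-limit reclaim, while reclaiming too little would make mem_cgroup_soft_limit_reclaim() revisit the same cgroup on every allocation.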