author	Balbir Singh <balbir@linux.vnet.ibm.com>	2009-09-23 18:56:39 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-24 10:20:59 -0400
commit	4e41695356fb4e0b153be1440ad027e46e0a7ea2 (patch)
tree	547dae77d1655a1acb260ea8b266c7b8a48f2d2c	/mm/memcontrol.c
parent	75822b4495b62e8721e9b88e3cf9e653a0c85b73 (diff)
memory controller: soft limit reclaim on contention
Implement reclaim from groups over their soft limit.

Permit reclaim from memory cgroups on contention (via the direct reclaim
path).

Memory cgroup soft limit reclaim finds the group that exceeds its soft
limit by the largest number of pages, reclaims pages from it, and then
reinserts the cgroup into its correct place in the rbtree.

Add additional checks to mem_cgroup_hierarchical_reclaim() to detect long
loops in case all swap is turned off.  The code has been refactored, and
the loop check (loop < 2) has been enhanced for soft limits.  For soft
limits, we try to do more targeted reclaim: instead of bailing out after
two loops, the routine now reclaims memory proportional to the amount by
which the soft limit is exceeded.  The proportion has been determined
empirically.

[akpm@linux-foundation.org: build fix]
[kamezawa.hiroyu@jp.fujitsu.com: fix softlimit css refcnt handling]
[nishimura@mxp.nes.nec.co.jp: refcount of the "victim" should be decremented before exiting the loop]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
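The proportional cutoff described above (reclaim until at least excess >> 2
pages have been freed, capped at MEM_CGROUP_MAX_RECLAIM_LOOPS passes) can be
sketched in isolation.  A minimal userspace sketch, illustration only and not
part of the patch; reclaim_one_pass() is an invented stand-in for a real
reclaim pass:

/*
 * Illustration only -- not kernel code.  Keep reclaiming until at least
 * a quarter of the soft-limit excess (excess >> 2) has been freed, or
 * until the loop cap is hit.
 */
#include <stdio.h>

#define MAX_RECLAIM_LOOPS	100	/* mirrors MEM_CGROUP_MAX_RECLAIM_LOOPS */

/* Invented stand-in for try_to_free_mem_cgroup_pages(): pretend each
 * pass frees a handful of pages. */
static unsigned long reclaim_one_pass(void)
{
	return 3;
}

static unsigned long soft_limit_reclaim(unsigned long excess)
{
	unsigned long total = 0;
	int loop;

	for (loop = 0; loop < MAX_RECLAIM_LOOPS; loop++) {
		total += reclaim_one_pass();
		/* Proportional target: a quarter of the excess. */
		if (total >= (excess >> 2))
			break;
	}
	return total;
}

int main(void)
{
	unsigned long excess = 1000;	/* pages over the soft limit */

	printf("reclaimed %lu of %lu excess pages\n",
	       soft_limit_reclaim(excess), excess);
	return 0;
}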
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--	mm/memcontrol.c	226
1 file changed, 212 insertions(+), 14 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 90f0b13e1c3c..011aba6cad70 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -139,6 +139,8 @@ struct mem_cgroup_per_zone {
 	unsigned long long	usage_in_excess;/* Set to the value by which */
 						/* the soft limit is exceeded*/
 	bool			on_tree;
+	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
+						/* use container_of	   */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -228,6 +230,13 @@ struct mem_cgroup {
 	struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -259,6 +268,8 @@ enum charge_type {
 #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
 #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
 #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
+#define MEM_CGROUP_RECLAIM_SOFT	(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
@@ -299,7 +310,7 @@ soft_limit_tree_from_page(struct page *page)
 }
 
 static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
@@ -311,7 +322,6 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 		return;
 
 	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-	spin_lock(&mctz->lock);
 	while (*p) {
 		parent = *p;
 		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
@@ -328,6 +338,26 @@ mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
 	rb_link_node(&mz->tree_node, parent, p);
 	rb_insert_color(&mz->tree_node, &mctz->rb_root);
 	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_insert_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -337,8 +367,7 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
 	spin_lock(&mctz->lock);
-	rb_erase(&mz->tree_node, &mctz->rb_root);
-	mz->on_tree = false;
+	__mem_cgroup_remove_exceeded(mem, mz, mctz);
 	spin_unlock(&mctz->lock);
 }
 
@@ -408,6 +437,47 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
 	}
 }
 
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz = NULL;
+
+retry:
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back;
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+		!css_tryget(&mz->mem->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -1037,6 +1107,7 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, to avoid freeing too much, this returns immediately.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+						struct zone *zone,
 						gfp_t gfp_mask,
 						unsigned long reclaim_options)
 {
@@ -1045,23 +1116,53 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	int loop = 0;
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+	unsigned long excess = mem_cgroup_get_excess(root_mem);
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
 	if (root_mem->memsw_is_minimum)
 		noswap = true;
 
-	while (loop < 2) {
+	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
-		if (victim == root_mem)
+		if (victim == root_mem) {
 			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy
+				 */
+				if (!check_soft || !total) {
+					css_put(&victim->css);
+					break;
+				}
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not so large that we
+				 * reclaim too much, nor so small that we
+				 * keep coming back to reclaim from this
+				 * cgroup
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+					css_put(&victim->css);
+					break;
+				}
+			}
+		}
 		if (!mem_cgroup_local_usage(&victim->stat)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
 		}
 		/* we use swappiness of local cgroup */
-		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-						   get_swappiness(victim));
+		if (check_soft)
+			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+				noswap, get_swappiness(victim), zone,
+				zone->zone_pgdat->node_id);
+		else
+			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+						noswap, get_swappiness(victim));
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -1071,7 +1172,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		if (shrink)
 			return ret;
 		total += ret;
-		if (mem_cgroup_check_under_limit(root_mem))
+		if (check_soft) {
+			if (res_counter_check_under_soft_limit(&root_mem->res))
+				return total;
+		} else if (mem_cgroup_check_under_limit(root_mem))
 			return 1 + total;
 	}
 	return total;
@@ -1206,8 +1310,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-							flags);
+		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+							gfp_mask, flags);
 		if (ret)
 			continue;
 
@@ -2018,8 +2122,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-						MEM_CGROUP_RECLAIM_SHRINK);
+		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+						GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -2071,7 +2176,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
 						MEM_CGROUP_RECLAIM_NOSWAP |
 						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
@@ -2084,6 +2189,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(nid, zid);
+	/*
+	 * This loop can run a while, especially if mem_cgroups continuously
+	 * keep exceeding their soft limit and putting the system under
+	 * pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+						gfp_mask,
+						MEM_CGROUP_RECLAIM_SOFT);
+		nr_reclaimed += reclaimed;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup,
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have added the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz) {
+					css_put(&next_mz->mem->css);
+					next_mz = NULL;
+				} else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		mz->usage_in_excess =
+			res_counter_soft_limit_excess(&mz->mem->res);
+		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0 simply because, due
+		 * to priority, we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		if (mz->usage_in_excess)
+			__mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->mem->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->mem->css);
+	return nr_reclaimed;
+}
+
 /*
  * This routine traverses page_cgroup in the given list and drops them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2686,6 +2882,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
 		mz->usage_in_excess = 0;
+		mz->on_tree = false;
+		mz->mem = mem;
 	}
 	return 0;
 }
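Aside: the per-zone rbtree keyed by usage_in_excess makes "pick the biggest
offender" a cheap rb_last() lookup.  A minimal userspace sketch of that
select/reclaim/reinsert cycle follows, illustration only and not from the
patch; a scanned array stands in for the rbtree and all names and numbers
are invented:

/*
 * Illustration only -- not kernel code.  Mimics the driver loop of
 * mem_cgroup_soft_limit_reclaim(): pick the group with the largest
 * soft-limit excess, reclaim from it, and keep it in the pool with an
 * updated excess (the "reinsert").  The kernel additionally falls over
 * to a different group when a pass reclaims nothing; that path is
 * omitted here.
 */
#include <stdio.h>

#define NGROUPS 3

/* Pages by which each "cgroup" exceeds its soft limit. */
static unsigned long excess[NGROUPS] = { 40, 10, 25 };

/* Stand-in for rb_last(): index of the largest excess, -1 if none. */
static int largest_soft_limit_node(void)
{
	int i, best = -1;

	for (i = 0; i < NGROUPS; i++)
		if (excess[i] && (best < 0 || excess[i] > excess[best]))
			best = i;
	return best;
}

/* Pretend one hierarchical reclaim pass frees at most 16 pages. */
static unsigned long reclaim_from(int g)
{
	return excess[g] < 16 ? excess[g] : 16;
}

int main(void)
{
	unsigned long nr_reclaimed = 0, got;
	int victim;

	/* Like the kernel loop: stop once anything has been reclaimed. */
	do {
		victim = largest_soft_limit_node();
		if (victim < 0)
			break;
		got = reclaim_from(victim);
		excess[victim] -= got;	/* reinsert with updated excess */
		nr_reclaimed += got;
		printf("group %d: reclaimed %lu, excess now %lu\n",
		       victim, got, excess[victim]);
	} while (!nr_reclaimed);

	printf("total reclaimed: %lu\n", nr_reclaimed);
	return 0;
}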