author     Balbir Singh <balbir@linux.vnet.ibm.com>        2009-01-07 21:08:06 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-01-08 11:31:06 -0500
commit     6d61ef409d6ba168972f7c2f8c35baaade636a58 (patch)
tree       b43ef8d2426ced40bd3ed4741ea8523e4496da23 /mm
parent     28dbc4b6a01fb579a9441c7b81e3d3413dc452df (diff)
memcg: memory cgroup hierarchical reclaim
This patch introduces hierarchical reclaim.  When an ancestor goes over
its limit, the charging routine points to the parent that is above its
limit.  The reclaim process then starts from the last scanned child of
the ancestor and reclaims until the ancestor goes below its limit.

[akpm@linux-foundation.org: coding-style fixes]
[d-nishimura@mtf.biglobe.ne.jp: mem_cgroup_from_res_counter should handle
 both mem->res and mem->memsw]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--   mm/memcontrol.c | 166

1 file changed, 162 insertions(+), 4 deletions(-)
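For intuition about the walk that mem_cgroup_get_next_node() (added in the diff
below) performs, here is a minimal user-space sketch of the same DFS successor
order: descend to the first child if one exists, otherwise take the next
sibling, otherwise climb toward the root looking for a parent's sibling, and
wrap around to the root once the whole tree has been visited.  Everything here
(struct node, next_node(), the sample tree) is an illustrative stand-in, not
kernel code; the kernel version additionally holds cgroup_mutex and manages
reference counts on the memory cgroups it hands back.

    #include <stdio.h>

    struct node {
            const char *name;
            struct node *parent;
            struct node *first_child;
            struct node *next_sibling;
    };

    /* Same visit order as mem_cgroup_get_next_node() in the patch below. */
    static struct node *next_node(struct node *curr, struct node *root)
    {
            if (curr->first_child)
                    return curr->first_child;       /* walk down to children */
            while (curr != root) {
                    if (curr->next_sibling)
                            return curr->next_sibling;      /* next sibling */
                    curr = curr->parent;    /* go up, retry at the parent */
            }
            return root;    /* every node visited; wrap back to the root */
    }

    int main(void)
    {
            /* root -> { a -> { a1 }, b } */
            struct node root = { "root", NULL,  NULL, NULL };
            struct node a    = { "a",    &root, NULL, NULL };
            struct node a1   = { "a1",   &a,    NULL, NULL };
            struct node b    = { "b",    &root, NULL, NULL };

            root.first_child = &a;
            a.first_child    = &a1;
            a.next_sibling   = &b;

            struct node *n = &root;
            for (int i = 0; i < 5; i++) {
                    n = next_node(n, &root);
                    printf("%s\n", n->name);        /* prints: a a1 b root a */
            }
            return 0;
    }

Because each call starts from whatever last_scanned_child holds, successive
reclaim passes resume where the previous one stopped instead of always
hammering the first child in the list.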
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e72fb2b4a7d8..20e1d90b3363 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -143,6 +143,13 @@ struct mem_cgroup {
 	struct mem_cgroup_lru_info info;
 
 	int prev_priority;	/* for recording reclaim priority */
+
+	/*
+	 * While reclaiming in a hierarchy, we cache the last child we
+	 * reclaimed from. Protected by cgroup_lock()
+	 */
+	struct mem_cgroup *last_scanned_child;
+
 	int obsolete;
 	atomic_t refcnt;
 	/*
@@ -461,6 +468,149 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	return nr_taken;
 }
 
+#define mem_cgroup_from_res_counter(counter, member)	\
+	container_of(counter, struct mem_cgroup, member)
+
+/*
+ * This routine finds the DFS walk successor. This routine should be
+ * called with cgroup_mutex held
+ */
+static struct mem_cgroup *
+mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
+{
+	struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
+
+	curr_cgroup = curr->css.cgroup;
+	root_cgroup = root_mem->css.cgroup;
+
+	if (!list_empty(&curr_cgroup->children)) {
+		/*
+		 * Walk down to children
+		 */
+		mem_cgroup_put(curr);
+		cgroup = list_entry(curr_cgroup->children.next,
+						struct cgroup, sibling);
+		curr = mem_cgroup_from_cont(cgroup);
+		mem_cgroup_get(curr);
+		goto done;
+	}
+
+visit_parent:
+	if (curr_cgroup == root_cgroup) {
+		mem_cgroup_put(curr);
+		curr = root_mem;
+		mem_cgroup_get(curr);
+		goto done;
+	}
+
+	/*
+	 * Goto next sibling
+	 */
+	if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
+		mem_cgroup_put(curr);
+		cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
+					sibling);
+		curr = mem_cgroup_from_cont(cgroup);
+		mem_cgroup_get(curr);
+		goto done;
+	}
+
+	/*
+	 * Go up to next parent and next parent's sibling if need be
+	 */
+	curr_cgroup = curr_cgroup->parent;
+	goto visit_parent;
+
+done:
+	root_mem->last_scanned_child = curr;
+	return curr;
+}
+
+/*
+ * Visit the first child (need not be the first child as per the ordering
+ * of the cgroup list, since we track last_scanned_child) of @mem and use
+ * that to reclaim free pages from.
+ */
+static struct mem_cgroup *
+mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
+{
+	struct cgroup *cgroup;
+	struct mem_cgroup *ret;
+	bool obsolete = (root_mem->last_scanned_child &&
+				root_mem->last_scanned_child->obsolete);
+
+	/*
+	 * Scan all children under the mem_cgroup mem
+	 */
+	cgroup_lock();
+	if (list_empty(&root_mem->css.cgroup->children)) {
+		ret = root_mem;
+		goto done;
+	}
+
+	if (!root_mem->last_scanned_child || obsolete) {
+
+		if (obsolete)
+			mem_cgroup_put(root_mem->last_scanned_child);
+
+		cgroup = list_first_entry(&root_mem->css.cgroup->children,
+				struct cgroup, sibling);
+		ret = mem_cgroup_from_cont(cgroup);
+		mem_cgroup_get(ret);
+	} else
+		ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
+						root_mem);
+
+done:
+	root_mem->last_scanned_child = ret;
+	cgroup_unlock();
+	return ret;
+}
+
+/*
+ * Dance down the hierarchy if needed to reclaim memory. We remember the
+ * last child we reclaimed from, so that we don't end up penalizing
+ * one child extensively based on its position in the children list.
+ *
+ * root_mem is the original ancestor that we've been reclaiming from.
+ */
+static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
+						gfp_t gfp_mask, bool noswap)
+{
+	struct mem_cgroup *next_mem;
+	int ret = 0;
+
+	/*
+	 * Reclaim unconditionally and don't check for return value.
+	 * We need to reclaim in the current group and down the tree.
+	 * One might think about checking for children before reclaiming,
+	 * but there might be left over accounting, even after children
+	 * have left.
+	 */
+	ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap);
+	if (res_counter_check_under_limit(&root_mem->res))
+		return 0;
+
+	next_mem = mem_cgroup_get_first_node(root_mem);
+
+	while (next_mem != root_mem) {
+		if (next_mem->obsolete) {
+			mem_cgroup_put(next_mem);
+			cgroup_lock();
+			next_mem = mem_cgroup_get_first_node(root_mem);
+			cgroup_unlock();
+			continue;
+		}
+		ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
+		if (res_counter_check_under_limit(&root_mem->res))
+			return 0;
+		cgroup_lock();
+		next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
+		cgroup_unlock();
+	}
+	return ret;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
@@ -469,7 +619,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 					gfp_t gfp_mask, struct mem_cgroup **memcg,
 					bool oom)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct res_counter *fail_res;
 	/*
@@ -511,12 +661,18 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			/* mem+swap counter fails */
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			noswap = true;
-		}
+			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+									memsw);
+		} else
+			/* mem counter fails */
+			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
+									res);
+
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		if (try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap))
-			continue;
+		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
+							noswap);
 
 		/*
 		 * try_to_free_mem_cgroup_pages() might not give us a full
@@ -1732,6 +1888,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	res_counter_init(&mem->memsw, parent ? &parent->memsw : NULL);
 
 
+	mem->last_scanned_child = NULL;
+
 	return &mem->css;
 free_out:
 	for_each_node_state(node, N_POSSIBLE)
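
A note on the mem_cgroup_from_res_counter() macro the hunks above rely on: it
is a thin wrapper around the kernel's container_of(), recovering the enclosing
mem_cgroup from a pointer to either of its embedded res_counters.  The
changelog's d-nishimura fix is precisely about passing the member (res vs.
memsw) that matches the counter that actually failed.  Below is a
self-contained user-space demonstration of the idiom; the struct and macro
names here are made up stand-ins for the kernel's, not the kernel definitions.

    #include <stddef.h>
    #include <stdio.h>

    /* User-space rendition of the kernel's container_of(). */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct counter { long usage; };     /* stands in for res_counter */

    struct group {                      /* stands in for mem_cgroup */
            int id;
            struct counter res;         /* memory counter */
            struct counter memsw;       /* memory+swap counter */
    };

    #define group_from_counter(cnt, member) \
            container_of(cnt, struct group, member)

    int main(void)
    {
            struct group g = { .id = 42 };

            /* Pretend the mem+swap counter was the one that hit its limit. */
            struct counter *fail_res = &g.memsw;

            /*
             * Passing the wrong member here would subtract the wrong offset
             * and yield a bogus pointer, which is exactly what the res/memsw
             * fix in the changelog guards against.
             */
            struct group *over_limit = group_from_counter(fail_res, memsw);
            printf("over-limit group id = %d\n", over_limit->id);
            return 0;
    }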