From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Thu, 2 Apr 2009 16:57:26 -0700
Subject: cgroup: fix frequent -EBUSY at rmdir

In the following situation, with the memory subsystem,

	/groupA		use_hierarchy==1
		/01	some tasks
		/02	some tasks
		/03	some tasks
		/04	empty

when tasks under 01/02/03 hit the limit on /groupA, hierarchical reclaim is triggered and the kernel walks the tree under groupA. In this case, rmdir /groupA/04 fails with -EBUSY frequently because of a temporary refcnt taken by the kernel.

In general, a cgroup can be rmdir'd if there are no child groups and no tasks. Frequent rmdir() failures are not useful to users (and in most cases the reason for the -EBUSY is unknown to them).

This patch modifies the above behavior by
- retrying if the css refcnt is held by someone, and
- adding a return value to pre_destroy() that allows a subsystem to say "we're really busy!"

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Paul Menage
Cc: Li Zefan
Cc: Balbir Singh
Cc: Daisuke Nishimura
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6..8ffec674c5a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2272,11 +2272,12 @@ free_out: return ERR_PTR(-ENOMEM); } -static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, +static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - mem_cgroup_force_empty(mem, false); + + return mem_cgroup_force_empty(mem, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss,
-- cgit v1.2.2

From 04046e1a0a34286382e913f8fc461440c21d88e8 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Thu, 2 Apr 2009 16:57:33 -0700
Subject: memcg: use CSS ID

Assign a CSS ID to each memcg and use css_get_next() for scanning the hierarchy.

Assume the following tree:

	group_A (ID=3)
		/01 (ID=4)
		   /0A (ID=7)
		/02 (ID=10)
	group_B (ID=5)

and a task in group_A/01/0A hits the limit at group_A. Reclaim will be done in the following order (round-robin):

	group_A(3) -> group_A/01 (4) -> group_A/01/0A (7) -> group_A/02(10) -> group_A -> .....

Round-robin by ID: the last visited cgroup is recorded, and the scan restarts from it when reclaim starts again. (A smarter algorithm could be implemented.)

No cgroup_mutex or hierarchy_mutex is required.

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Paul Menage
Cc: Li Zefan
Cc: Balbir Singh
Cc: Daisuke Nishimura
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 220 +++++++++++++++++++++-----------------------------------
 1 file changed, 82 insertions(+), 138 deletions(-)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ffec674c5a..61fd9590c13 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -95,6 +95,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, return ret; } +static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) +{ + s64 ret; + + ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); + ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); + return ret; +} + /* * per-zone information in memory controller. */ @@ -154,9 +163,9 @@ struct mem_cgroup { /* * While reclaiming in a hiearchy, we cache the last child we - * reclaimed from. Protected by hierarchy_mutex + * reclaimed from.
*/ - struct mem_cgroup *last_scanned_child; + int last_scanned_child; /* * Should the accounting and control be hierarchical, per subtree? */ @@ -629,103 +638,6 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, #define mem_cgroup_from_res_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) -/* - * This routine finds the DFS walk successor. This routine should be - * called with hierarchy_mutex held - */ -static struct mem_cgroup * -__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) -{ - struct cgroup *cgroup, *curr_cgroup, *root_cgroup; - - curr_cgroup = curr->css.cgroup; - root_cgroup = root_mem->css.cgroup; - - if (!list_empty(&curr_cgroup->children)) { - /* - * Walk down to children - */ - cgroup = list_entry(curr_cgroup->children.next, - struct cgroup, sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } - -visit_parent: - if (curr_cgroup == root_cgroup) { - /* caller handles NULL case */ - curr = NULL; - goto done; - } - - /* - * Goto next sibling - */ - if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { - cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, - sibling); - curr = mem_cgroup_from_cont(cgroup); - goto done; - } - - /* - * Go up to next parent and next parent's sibling if need be - */ - curr_cgroup = curr_cgroup->parent; - goto visit_parent; - -done: - return curr; -} - -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free pages from. - */ -static struct mem_cgroup * -mem_cgroup_get_next_node(struct mem_cgroup *root_mem) -{ - struct cgroup *cgroup; - struct mem_cgroup *orig, *next; - bool obsolete; - - /* - * Scan all children under the mem_cgroup mem - */ - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); - - orig = root_mem->last_scanned_child; - obsolete = mem_cgroup_is_obsolete(orig); - - if (list_empty(&root_mem->css.cgroup->children)) { - /* - * root_mem might have children before and last_scanned_child - * may point to one of them. We put it later. - */ - if (orig) - VM_BUG_ON(!obsolete); - next = NULL; - goto done; - } - - if (!orig || obsolete) { - cgroup = list_first_entry(&root_mem->css.cgroup->children, - struct cgroup, sibling); - next = mem_cgroup_from_cont(cgroup); - } else - next = __mem_cgroup_get_next_node(orig, root_mem); - -done: - if (next) - mem_cgroup_get(next); - root_mem->last_scanned_child = next; - if (orig) - mem_cgroup_put(orig); - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); - return (next) ? next : root_mem; -} - static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) { if (do_swap_account) { @@ -755,46 +667,79 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) } /* - * Dance down the hierarchy if needed to reclaim memory. We remember the - * last child we reclaimed from, so that we don't end up penalizing - * one child extensively based on its position in the children list. + * Visit the first child (need not be the first child as per the ordering + * of the cgroup list, since we track last_scanned_child) of @mem and use + * that to reclaim free pages from. 
+ */ +static struct mem_cgroup * +mem_cgroup_select_victim(struct mem_cgroup *root_mem) +{ + struct mem_cgroup *ret = NULL; + struct cgroup_subsys_state *css; + int nextid, found; + + if (!root_mem->use_hierarchy) { + css_get(&root_mem->css); + ret = root_mem; + } + + while (!ret) { + rcu_read_lock(); + nextid = root_mem->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + &found); + if (css && css_tryget(css)) + ret = container_of(css, struct mem_cgroup, css); + + rcu_read_unlock(); + /* Updates scanning parameter */ + spin_lock(&root_mem->reclaim_param_lock); + if (!css) { + /* this means start scan from ID:1 */ + root_mem->last_scanned_child = 0; + } else + root_mem->last_scanned_child = found; + spin_unlock(&root_mem->reclaim_param_lock); + } + + return ret; +} + +/* + * Scan the hierarchy if needed to reclaim memory. We remember the last child + * we reclaimed from, so that we don't end up penalizing one child extensively + * based on its position in the children list. * * root_mem is the original ancestor that we've been reclaim from. + * + * We give up and return to the caller when we visit root_mem twice. + * (other groups can be removed while we're walking....) */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, gfp_t gfp_mask, bool noswap) { - struct mem_cgroup *next_mem; - int ret = 0; - - /* - * Reclaim unconditionally and don't check for return value. - * We need to reclaim in the current group and down the tree. - * One might think about checking for children before reclaiming, - * but there might be left over accounting, even after children - * have left. - */ - ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, - get_swappiness(root_mem)); - if (mem_cgroup_check_under_limit(root_mem)) - return 1; /* indicate reclaim has succeeded */ - if (!root_mem->use_hierarchy) - return ret; - - next_mem = mem_cgroup_get_next_node(root_mem); - - while (next_mem != root_mem) { - if (mem_cgroup_is_obsolete(next_mem)) { - next_mem = mem_cgroup_get_next_node(root_mem); + struct mem_cgroup *victim; + int ret, total = 0; + int loop = 0; + + while (loop < 2) { + victim = mem_cgroup_select_victim(root_mem); + if (victim == root_mem) + loop++; + if (!mem_cgroup_local_usage(&victim->stat)) { + /* this cgroup's local usage == 0 */ + css_put(&victim->css); continue; } - ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, - get_swappiness(next_mem)); + /* we use swappiness of local cgroup */ + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, + get_swappiness(victim)); + css_put(&victim->css); + total += ret; if (mem_cgroup_check_under_limit(root_mem)) - return 1; /* indicate reclaim has succeeded */ - next_mem = mem_cgroup_get_next_node(root_mem); + return 1 + total; } - return ret; + return total; } bool mem_cgroup_oom_called(struct task_struct *task) @@ -1324,8 +1269,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) res_counter_uncharge(&mem->res, PAGE_SIZE); if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) res_counter_uncharge(&mem->memsw, PAGE_SIZE); - mem_cgroup_charge_statistics(mem, pc, false); + ClearPageCgroupUsed(pc); /* * pc->mem_cgroup is not cleared here. 
It will be accessed when it's @@ -2178,6 +2123,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; + free_css_id(&mem_cgroup_subsys, &mem->css); + for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); @@ -2228,11 +2175,12 @@ static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem, *parent; + long error = -ENOMEM; int node; mem = mem_cgroup_alloc(); if (!mem) - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) @@ -2260,7 +2208,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->res, NULL); res_counter_init(&mem->memsw, NULL); } - mem->last_scanned_child = NULL; + mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); if (parent) @@ -2269,7 +2217,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); - return ERR_PTR(-ENOMEM); + return ERR_PTR(error); } static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, @@ -2284,12 +2232,7 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - struct mem_cgroup *last_scanned_child = mem->last_scanned_child; - if (last_scanned_child) { - VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); - mem_cgroup_put(last_scanned_child); - } mem_cgroup_put(mem); } @@ -2328,6 +2271,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .populate = mem_cgroup_populate, .attach = mem_cgroup_move_task, .early_init = 0, + .use_id = 1, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-- cgit v1.2.2

From 14067bb3e24b96d92e22d19c18c0119edf5575e5 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Thu, 2 Apr 2009 16:57:35 -0700
Subject: memcg: hierarchical stat

Clean up the memory.stat file routine and show "total" hierarchical stats.

This patch does the following:
- rename get_all_zonestat to get_local_zonestat.
- remove the old mem_cgroup_stat_desc, which is only for per-cpu stats.
- add mcs_stat to cover both per-cpu and per-lru stats.
- add "total" stats of the hierarchy (*)
- add a callback system to scan all memcgs under a root.

==	"total" is added.
[kamezawa@localhost ~]$ cat /opt/cgroup/xxx/memory.stat
cache 0
rss 0
pgpgin 0
pgpgout 0
inactive_anon 0
active_anon 0
inactive_file 0
active_file 0
unevictable 0
hierarchical_memory_limit 50331648
hierarchical_memsw_limit 9223372036854775807
total_cache 65536
total_rss 192512
total_pgpgin 218
total_pgpgout 155
total_inactive_anon 0
total_active_anon 135168
total_inactive_file 61440
total_active_file 4096
total_unevictable 0
==

(*) The user could calculate the hierarchical stats with his own program in userland, but if it can be shown in a clean way, it's worth showing here, I think.
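To illustrate what the new total_* fields compute, here is a minimal userland sketch (not kernel code, and not part of this patch) that derives a hierarchical "rss" total by summing the local stats of a group and its direct children. The mount point, the helper names and the one-level walk are assumptions made for the example:

	#include <dirent.h>
	#include <stdio.h>
	#include <string.h>

	/* Return the value of <key> in <path>/memory.stat, or 0 if unreadable. */
	static long long read_stat_key(const char *path, const char *key)
	{
		char file[512], name[64];
		long long val, found = 0;
		FILE *f;

		snprintf(file, sizeof(file), "%s/memory.stat", path);
		f = fopen(file, "r");
		if (!f)
			return 0;
		while (fscanf(f, "%63s %lld", name, &val) == 2)
			if (!strcmp(name, key))
				found = val;
		fclose(f);
		return found;
	}

	/* Sum "rss" over <root> and its direct children only. */
	static long long total_rss(const char *root)
	{
		char sub[512];
		struct dirent *d;
		long long total = read_stat_key(root, "rss");
		DIR *dir = opendir(root);

		if (!dir)
			return total;
		while ((d = readdir(dir)) != NULL) {
			if (d->d_name[0] == '.')
				continue;
			snprintf(sub, sizeof(sub), "%s/%s", root, d->d_name);
			/* non-directories have no memory.stat; they add 0 */
			total += read_stat_key(sub, "rss");
		}
		closedir(dir);
		return total;
	}

The in-kernel callback walk added below does the same aggregation by scanning CSS IDs, which is both cheaper and safe against concurrent rmdir.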
Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Daisuke Nishimura Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 160 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 119 insertions(+), 41 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 61fd9590c13..33fc0302e29 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -256,7 +256,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) return mem_cgroup_zoneinfo(mem, nid, zid); } -static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, +static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { int nid, zid; @@ -317,6 +317,42 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) return css_is_removed(&mem->css); } + +/* + * Call callback function against all cgroup under hierarchy tree. + */ +static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, + int (*func)(struct mem_cgroup *, void *)) +{ + int found, ret, nextid; + struct cgroup_subsys_state *css; + struct mem_cgroup *mem; + + if (!root->use_hierarchy) + return (*func)(root, data); + + nextid = 1; + do { + ret = 0; + mem = NULL; + + rcu_read_lock(); + css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, + &found); + if (css && css_tryget(css)) + mem = container_of(css, struct mem_cgroup, css); + rcu_read_unlock(); + + if (mem) { + ret = (*func)(mem, data); + css_put(&mem->css); + } + nextid = found + 1; + } while (!ret && css); + + return ret; +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -510,8 +546,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ unsigned long gb; unsigned long inactive_ratio; - inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); - active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); + inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); + active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1838,54 +1874,90 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } -static const struct mem_cgroup_stat_desc { - const char *msg; - u64 unit; -} mem_cgroup_stat_desc[] = { - [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, - [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, - [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, - [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, + +/* For read statistics */ +enum { + MCS_CACHE, + MCS_RSS, + MCS_PGPGIN, + MCS_PGPGOUT, + MCS_INACTIVE_ANON, + MCS_ACTIVE_ANON, + MCS_INACTIVE_FILE, + MCS_ACTIVE_FILE, + MCS_UNEVICTABLE, + NR_MCS_STAT, +}; + +struct mcs_total_stat { + s64 stat[NR_MCS_STAT]; }; +struct { + char *local_name; + char *total_name; +} memcg_stat_strings[NR_MCS_STAT] = { + {"cache", "total_cache"}, + {"rss", "total_rss"}, + {"pgpgin", "total_pgpgin"}, + {"pgpgout", "total_pgpgout"}, + {"inactive_anon", "total_inactive_anon"}, + {"active_anon", "total_active_anon"}, + {"inactive_file", "total_inactive_file"}, + {"active_file", "total_active_file"}, + {"unevictable", "total_unevictable"} +}; + + +static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) +{ + struct mcs_total_stat *s = data; + s64 val; + + /* per cpu stat */ + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); + s->stat[MCS_CACHE] 
+= val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); + s->stat[MCS_RSS] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); + s->stat[MCS_PGPGIN] += val; + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); + s->stat[MCS_PGPGOUT] += val; + + /* per zone stat */ + val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); + s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); + s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); + s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); + s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; + val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); + s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; + return 0; +} + +static void +mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +{ + mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); +} + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); - struct mem_cgroup_stat *stat = &mem_cont->stat; + struct mcs_total_stat mystat; int i; - for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { - s64 val; + memset(&mystat, 0, sizeof(mystat)); + mem_cgroup_get_local_stat(mem_cont, &mystat); - val = mem_cgroup_read_stat(stat, i); - val *= mem_cgroup_stat_desc[i].unit; - cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); - } - /* showing # of active pages */ - { - unsigned long active_anon, inactive_anon; - unsigned long active_file, inactive_file; - unsigned long unevictable; - - inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, - LRU_INACTIVE_ANON); - active_anon = mem_cgroup_get_all_zonestat(mem_cont, - LRU_ACTIVE_ANON); - inactive_file = mem_cgroup_get_all_zonestat(mem_cont, - LRU_INACTIVE_FILE); - active_file = mem_cgroup_get_all_zonestat(mem_cont, - LRU_ACTIVE_FILE); - unevictable = mem_cgroup_get_all_zonestat(mem_cont, - LRU_UNEVICTABLE); - - cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); - cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); - cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); - cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); - cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + for (i = 0; i < NR_MCS_STAT; i++) + cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); - } + /* Hierarchical information */ { unsigned long long limit, memsw_limit; memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); @@ -1894,6 +1966,12 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); } + memset(&mystat, 0, sizeof(mystat)); + mem_cgroup_get_total_stat(mem_cont, &mystat); + for (i = 0; i < NR_MCS_STAT; i++) + cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); + + #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); -- cgit v1.2.2 From 81d39c20f5ee2437d71709beb82597e2a38efbbc Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 2 Apr 2009 16:57:36 -0700 Subject: memcg: fix shrinking memory to return -EBUSY by fixing retry algorithm As pointed out, shrinking memcg's limit should return -EBUSY after reasonable retries. This patch tries to fix the current behavior of shrink_usage. 
Before looking into the "shrink should return -EBUSY" problem, we should fix the hierarchical reclaim code. It compares current usage with the current limit, but that only makes sense when the kernel reclaims memory because a limit was hit. This is also a problem.

What this patch does:

1. add a new argument "shrink" to hierarchical reclaim. If "shrink==true", hierarchical reclaim returns immediately and the caller checks whether the kernel should shrink more or not. (While shrinking, usage is always smaller than the limit, so the usage < limit check is useless.)

2. to adjust to the above change, two changes in "shrink"'s retry path:

2-a. retry_count depends on the number of children, because the kernel visits the children under the hierarchy one by one.

2-b. rather than checking the return value of hierarchical_reclaim for progress, compare usage-before-shrink with usage-after-shrink. If usage-before-shrink <= usage-after-shrink, retry_count is decremented.

Reported-by: Li Zefan
Signed-off-by: KAMEZAWA Hiroyuki
Cc: Paul Menage
Cc: Balbir Singh
Cc: Daisuke Nishimura
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 59 insertions(+), 12 deletions(-)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33fc0302e29..6f6a575e77a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -702,6 +702,23 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) return swappiness; } +static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) +{ + int *val = data; + (*val)++; + return 0; +} +/* + * This function returns the number of memcg under hierarchy tree. Returns + * 1(self count) if no children. + */ +static int mem_cgroup_count_children(struct mem_cgroup *mem) +{ + int num = 0; + mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); + return num; +} + /* * Visit the first child (need not be the first child as per the ordering * of the cgroup list, since we track last_scanned_child) of @mem and use @@ -750,9 +767,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * * We give up and return to the caller when we visit root_mem twice. * (other groups can be removed while we're walking....) + * + * If shrink==true, for avoiding to free too much, this returns immedieately. */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - gfp_t gfp_mask, bool noswap) + gfp_t gfp_mask, bool noswap, bool shrink) { struct mem_cgroup *victim; int ret, total = 0; @@ -771,6 +790,13 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, get_swappiness(victim)); css_put(&victim->css); + /* + * At shrinking usage, we can't check we should stop here or + * reclaim more. It's depends on callers. last_scanned_child + * will work enough for keeping fairness under tree.
+ */ + if (shrink) + return ret; total += ret; if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; @@ -856,7 +882,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto nomem; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, - noswap); + noswap, false); if (ret) continue; @@ -1489,7 +1515,8 @@ int mem_cgroup_shrink_usage(struct page *page, return 0; do { - progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); + progress = mem_cgroup_hierarchical_reclaim(mem, + gfp_mask, true, false); progress += mem_cgroup_check_under_limit(mem); } while (!progress && --retry); @@ -1504,11 +1531,21 @@ static DEFINE_MUTEX(set_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { - - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int retry_count; int progress; u64 memswlimit; int ret = 0; + int children = mem_cgroup_count_children(memcg); + u64 curusage, oldusage; + + /* + * For keeping hierarchical_reclaim simple, how long we should retry + * is depends on callers. We set our retry-count to be function + * of # of children which we should visit in this loop. + */ + retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; + + oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); while (retry_count) { if (signal_pending(current)) { @@ -1534,8 +1571,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, - false); - if (!progress) retry_count--; + false, true); + curusage = res_counter_read_u64(&memcg->res, RES_USAGE); + /* Usage is reduced ? */ + if (curusage >= oldusage) + retry_count--; + else + oldusage = curusage; } return ret; @@ -1544,13 +1586,16 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { - int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int retry_count; u64 memlimit, oldusage, curusage; - int ret; + int children = mem_cgroup_count_children(memcg); + int ret = -EBUSY; if (!do_swap_account) return -EINVAL; - + /* see mem_cgroup_resize_res_limit */ + retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; + oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); while (retry_count) { if (signal_pending(current)) { ret = -EINTR; @@ -1574,11 +1619,13 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); + mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; + else + oldusage = curusage; } return ret; }
-- cgit v1.2.2

From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Thu, 2 Apr 2009 16:57:38 -0700
Subject: memcg: fix OOM killer under memcg

This patch tries to fix OOM Killer problems caused by hierarchy. Currently, memcg itself has an OOM KILL function (in oom_kill.c) and tries to kill a task in the memcg. But when hierarchy is in use this is broken, and the correct task cannot be killed. For example, in the following cgroup

	/groupA/	hierarchy=1, limit=1G,
		01	nolimit
		02	nolimit

all tasks' memory usage under /groupA, /groupA/01 and /groupA/02 is limited to groupA's 1Gbytes, but the OOM Killer just kills tasks in groupA. This patch makes the bad process be selected from all tasks under the hierarchy.
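In essence, victim selection must treat a task as being "in" the OOMing memcg whenever the task's own memcg is that memcg or any of its descendants. A simplified sketch of that membership test (mirroring the task_in_mem_cgroup() change in the diff below; the helper name is invented here, and the task_lock/rcu details are omitted):

	static bool task_under_oom_memcg(struct task_struct *task,
					 struct mem_cgroup *mem)
	{
		struct mem_cgroup *curr;
		bool ret;

		/* may return NULL, e.g. if the task has no mm */
		curr = try_get_mem_cgroup_from_mm(task->mm);
		if (!curr)
			return false;
		if (curr->use_hierarchy)
			/* true when mem is curr itself or an ancestor of curr */
			ret = css_is_ancestor(&curr->css, &mem->css);
		else
			ret = (curr == mem);
		css_put(&curr->css);
		return ret;
	}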
BTW, currently oom_jiffies is updated only against groupA in the above case; the oom_jiffies of the whole tree should be updated. To see how oom_jiffies is used, please check the mem_cgroup_oom_called() callers.

[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki
Cc: Paul Menage
Cc: Li Zefan
Cc: Balbir Singh
Cc: Daisuke Nishimura
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6f6a575e77a..025f8abfae2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *mem = NULL; + + if (!mm) + return NULL; /* * Because we have no locks, mm->owner's may be being moved to other * cgroup. We use css_tryget() here even if this looks @@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page, int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; + struct mem_cgroup *curr = NULL; task_lock(task); - ret = task->mm && mm_match_cgroup(task->mm, mem); + rcu_read_lock(); + curr = try_get_mem_cgroup_from_mm(task->mm); + rcu_read_unlock(); task_unlock(task); + if (!curr) + return 0; + if (curr->use_hierarchy) + ret = css_is_ancestor(&curr->css, &mem->css); + else + ret = (curr == mem); + css_put(&curr->css); return ret; } @@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task) rcu_read_unlock(); return ret; } + +static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +{ + mem->last_oom_jiffies = jiffies; + return 0; +} + +static void record_last_oom(struct mem_cgroup *mem) +{ + mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); +} + + /* * Unlike exported interface, "oom" parameter is added. if oom==true, * oom-killer can be invoked. @@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, mutex_lock(&memcg_tasklist); mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); mutex_unlock(&memcg_tasklist); - mem_over_limit->last_oom_jiffies = jiffies; + record_last_oom(mem_over_limit); } goto nomem; }
-- cgit v1.2.2

From e222432bfa7dcf6ec008622a978c9f284ed5e3a9 Mon Sep 17 00:00:00 2001
From: Balbir Singh
Date: Thu, 2 Apr 2009 16:57:39 -0700
Subject: memcg: show memcg information during OOM

Add RSS and swap to the OOM output from memcg.

Display memcg values like failcnt, usage and limit when an OOM occurs due to a memcg.

Thanks to Johannes Weiner, Li Zefan, David Rientjes, Kamezawa Hiroyuki, Daisuke Nishimura and KOSAKI Motohiro for review.
Sample output
-------------

Task in /a/x killed as a result of limit of /a
memory: usage 1048576kB, limit 1048576kB, failcnt 4183
memory+swap: usage 1400964kB, limit 9007199254740991kB, failcnt 0

[akpm@linux-foundation.org: compilation fix]
[akpm@linux-foundation.org: fix kerneldoc and whitespace]
[akpm@linux-foundation.org: add printk facility level]
Signed-off-by: Balbir Singh
Cc: KAMEZAWA Hiroyuki
Cc: Daisuke Nishimura
Cc: Li Zefan
Cc: Paul Menage
Cc: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 025f8abfae2..2bdb6149fae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/bit_spinlock.h> #include <linux/rcupdate.h> +#include <linux/limits.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/swap.h> @@ -721,6 +722,74 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) (*val)++; return 0; } + +/** + * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * @memcg: The memory cgroup that went over limit + * @p: Task that is going to be killed + * + * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is + * enabled + */ +void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) +{ + struct cgroup *task_cgrp; + struct cgroup *mem_cgrp; + /* + * Need a buffer in BSS, can't rely on allocations. The code relies + * on the assumption that OOM is serialized for memory controller. + * If this assumption is broken, revisit this code. + */ + static char memcg_name[PATH_MAX]; + int ret; + + if (!memcg) + return; + + + rcu_read_lock(); + + mem_cgrp = memcg->css.cgroup; + task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); + + ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + /* + * Unfortunately, we are unable to convert to a useful name + * But we'll still print out the usage information + */ + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + printk(KERN_INFO "Task in %s killed", memcg_name); + + rcu_read_lock(); + ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); + if (ret < 0) { + rcu_read_unlock(); + goto done; + } + rcu_read_unlock(); + + /* + * Continues from above, so we don't need an KERN_ level + */ + printk(KERN_CONT " as a result of limit of %s\n", memcg_name); +done: + + printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", + res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->res, RES_FAILCNT)); + printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " + "failcnt %llu\n", + res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, + res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, + res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); +} + /* * This function returns the number of memcg under hierarchy tree. Returns * 1(self count) if no children.
-- cgit v1.2.2

From c137b5ece4b111e46981aae7da77315b9909809f Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro
Date: Thu, 2 Apr 2009 16:57:40 -0700
Subject: memcg: remove mem_cgroup_calc_mapped_ratio()

Currently, mem_cgroup_calc_mapped_ratio() is not used at all, so it can be removed; KAMEZAWA-san suggested this.
Signed-off-by: KOSAKI Motohiro
Cc: KAMEZAWA Hiroyuki
Acked-by: Balbir Singh
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bdb6149fae..7bb14fdc780 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -507,23 +507,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * Calculate mapped_ratio under memory controller. This will be used in - * vmscan.c for deteremining we have to reclaim mapped pages. - */ -int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) -{ - long total, rss; - - /* - * usage is recorded in bytes. But, here, we assume the number of - * physical pages can be represented by "long" on any arch. - */ - total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; - rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); - return (int)((rss * 100L) / total); -} - /* * prev_priority control...this will be used in memory reclaim path. */
-- cgit v1.2.2

From 3c776e64660028236313f0e54f3a9945764422df Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura
Date: Thu, 2 Apr 2009 16:57:43 -0700
Subject: memcg: charge swapcache to proper memcg

memcg_test.txt says at 4.1:

	This swap-in is one of the most complicated work.
	In do_swap_page(), following events occur when pte is unchanged.
	(1) the page (SwapCache) is looked up.
	(2) lock_page()
	(3) try_charge_swapin()
	(4) reuse_swap_page() (may call delete_swap_cache())
	(5) commit_charge_swapin()
	(6) swap_free().

Consider the following situations, for example:

(A) The page has not been charged before (2) and reuse_swap_page() doesn't call delete_from_swap_cache().
(B) The page has not been charged before (2) and reuse_swap_page() calls delete_from_swap_cache().
(C) The page has been charged before (2) and reuse_swap_page() doesn't call delete_from_swap_cache().
(D) The page has been charged before (2) and reuse_swap_page() calls delete_from_swap_cache().

The memory.usage/memsw.usage changes for this page/swp_entry will be:

	Case          (A)      (B)       (C)     (D)
	Event
	Before (2)    0/ 1     0/ 1      1/ 1    1/ 1
	===========================================
	(3)          +1/+1    +1/+1     +1/+1   +1/+1
	(4)           -        0/ 0      -      -1/ 0
	(5)           0/-1     0/ 0     -1/-1    0/ 0
	(6)           -        0/-1      -       0/-1
	===========================================
	Result        1/ 1     1/ 1      1/ 1    1/ 1

In all cases, the charges to this page should end up as 1/ 1.

In case (D), try_get_mem_cgroup_from_swapcache() returns NULL (because lookup_swap_cgroup() returns NULL), so "+1/+1" at (3) means a charge to the memcg ("foo") to which "current" belongs. OTOH, "-1/ 0" at (4) and "0/-1" at (6) mean uncharges from the memcg ("baa") to which the page had been charged. So, if "foo" and "baa" are different (for example because of a task move), this charge is effectively moved from "baa" to "foo". I think this is unexpected behavior.

This patch fixes it by modifying try_get_mem_cgroup_from_swapcache() to return the memcg to which the swapcache has been charged when the PCG_USED bit is set. IIUC, checking the PCG_USED bit of swapcache is safe under the page lock.
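The essential shape of the fix (a simplified restatement of the hunk below — the helper name is invented for the sketch, the caller must hold the page lock, and the real change inside try_get_mem_cgroup_from_swapcache() additionally takes a reference with css_tryget()):

	static struct mem_cgroup *owner_of_swapcache(struct page *page)
	{
		struct page_cgroup *pc = lookup_page_cgroup(page);
		swp_entry_t ent;

		if (PageCgroupUsed(pc))
			/* cases (C)/(D): already charged, reuse its memcg */
			return pc->mem_cgroup;
		/* cases (A)/(B): fall back to the swap entry's record */
		ent.val = page_private(page);
		return lookup_swap_cgroup(ent);
	}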
Signed-off-by: Daisuke Nishimura
Cc: Balbir Singh
Cc: KAMEZAWA Hiroyuki
Cc: Li Zefan
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bb14fdc780..81b0ae8183d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -994,13 +994,24 @@ nomem: static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) { struct mem_cgroup *mem; + struct page_cgroup *pc; swp_entry_t ent; + VM_BUG_ON(!PageLocked(page)); + if (!PageSwapCache(page)) return NULL; - ent.val = page_private(page); - mem = lookup_swap_cgroup(ent); + pc = lookup_page_cgroup(page); + /* + * Used bit of swapcache is solid under page lock. + */ + if (PageCgroupUsed(pc)) + mem = pc->mem_cgroup; + else { + ent.val = page_private(page); + mem = lookup_swap_cgroup(ent); + } if (!mem) return NULL; if (!css_tryget(&mem->css))
-- cgit v1.2.2

From a3b2d692690aef228e493b1beaafe5364cab3237 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Thu, 2 Apr 2009 16:57:45 -0700
Subject: cgroups: use css id in swap cgroup for saving memory v5

Try to use CSS IDs for records in swap_cgroup. With this, on a 64-bit machine, the size of a swap_cgroup record goes down to 2 bytes from 8 bytes.

This means that when 2GB of swap is equipped (assuming a page size of 4096 bytes), the size of swap_cgroup drops

	from: 2G/4k * 8 = 4Mbytes
	to:   2G/4k * 2 = 1Mbytes.

The reduction is large. Of course, there are trade-offs: the CSS ID lookup adds overhead to swap-in/swap-out/swap-free. But in general,

- swap is a resource which users tend to avoid using.
- if swap is never used, the swap_cgroup area is not used.
- according to traditional manuals, the size of swap should be proportional to the size of memory, and machine memory sizes keep increasing.

I think reducing the size of swap_cgroup makes sense.

Note:
- the ID->CSS lookup routine takes no locks; it runs under the RCU read side.
- a memcg can become obsolete at rmdir() but is not freed while a refcnt from swap_cgroup is still held.

Changelog v4->v5:
- reworked on top of memcg-charge-swapcache-to-proper-memcg.patch

Changelog ->v4:
- fixed the not-configured case.
- deleted unnecessary comments.
- fixed a NULL pointer bug.
- fixed a message in dmesg.

[nishimura@mxp.nes.nec.co.jp: css_tryget can be called twice in !PageCgroupUsed case]
Signed-off-by: KAMEZAWA Hiroyuki
Cc: Li Zefan
Cc: Balbir Singh
Cc: Paul Menage
Cc: Hugh Dickins
Signed-off-by: Daisuke Nishimura
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 62 insertions(+), 12 deletions(-)

(limited to 'mm/memcontrol.c')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81b0ae8183d..55dea596846 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -991,10 +991,31 @@ nomem: return -ENOMEM; } + +/* + * A helper function to get mem_cgroup from ID. must be called under + * rcu_read_lock(). The caller must check css_is_removed() or some if + * it's concern. (dropping refcnt from swap can be called against removed + * memcg.)
+ */ +static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) +{ + struct cgroup_subsys_state *css; + + /* ID 0 is unused ID */ + if (!id) + return NULL; + css = css_lookup(&mem_cgroup_subsys, id); + if (!css) + return NULL; + return container_of(css, struct mem_cgroup, css); +} + static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) { struct mem_cgroup *mem; struct page_cgroup *pc; + unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); @@ -1006,16 +1027,19 @@ static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) /* * Used bit of swapcache is solid under page lock. */ - if (PageCgroupUsed(pc)) + if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; - else { + if (mem && !css_tryget(&mem->css)) + mem = NULL; + } else { ent.val = page_private(page); - mem = lookup_swap_cgroup(ent); + id = lookup_swap_cgroup(ent); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); + if (mem && !css_tryget(&mem->css)) + mem = NULL; + rcu_read_unlock(); } - if (!mem) - return NULL; - if (!css_tryget(&mem->css)) - return NULL; return mem; } @@ -1276,12 +1300,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (do_swap_account && !ret && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; /* avoid double counting */ - mem = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + mem = mem_cgroup_lookup(id); if (mem) { + /* + * We did swap-in. Then, this entry is doubly counted + * both in mem and memsw. We uncharge it, here. + * Recorded ID can be obsolete. We avoid calling + * css_tryget() + */ res_counter_uncharge(&mem->memsw, PAGE_SIZE); mem_cgroup_put(mem); } + rcu_read_unlock(); } return ret; } @@ -1346,13 +1380,21 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) */ if (do_swap_account && PageSwapCache(page)) { swp_entry_t ent = {.val = page_private(page)}; + unsigned short id; struct mem_cgroup *memcg; - memcg = swap_cgroup_record(ent, NULL); + + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * This recorded memcg can be obsolete one. So, avoid + * calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } - + rcu_read_unlock(); } /* add this page(page_cgroup) to the LRU we want. */ @@ -1473,7 +1515,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) MEM_CGROUP_CHARGE_TYPE_SWAPOUT); /* record memcg information */ if (do_swap_account && memcg) { - swap_cgroup_record(ent, memcg); + swap_cgroup_record(ent, css_id(&memcg->css)); mem_cgroup_get(memcg); } if (memcg) @@ -1488,15 +1530,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) void mem_cgroup_uncharge_swap(swp_entry_t ent) { struct mem_cgroup *memcg; + unsigned short id; if (!do_swap_account) return; - memcg = swap_cgroup_record(ent, NULL); + id = swap_cgroup_record(ent, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); if (memcg) { + /* + * We uncharge this because swap is freed. + * This memcg can be obsolete one. 
We avoid calling css_tryget + */ res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_put(memcg); } + rcu_read_unlock(); } #endif -- cgit v1.2.2 From 83aae4c737866da3280f51fd15da58eddd788397 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 2 Apr 2009 16:57:48 -0700 Subject: memcg: cleanup cache_charge Current mem_cgroup_cache_charge is a bit complicated especially in the case of shmem's swap-in. This patch cleans it up by using try_charge_swapin and commit_charge_swapin. Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 60 ++++++++++++++++++++++----------------------------------- 1 file changed, 23 insertions(+), 37 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 55dea596846..2fc6d6c4823 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1238,6 +1238,10 @@ int mem_cgroup_newpage_charge(struct page *page, MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } +static void +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, + enum charge_type ctype); + int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { @@ -1274,16 +1278,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, unlock_page_cgroup(pc); } - if (do_swap_account && PageSwapCache(page)) { - mem = try_get_mem_cgroup_from_swapcache(page); - if (mem) - mm = NULL; - else - mem = NULL; - /* SwapCache may be still linked to LRU now. */ - mem_cgroup_lru_del_before_commit_swapcache(page); - } - if (unlikely(!mm && !mem)) mm = &init_mm; @@ -1291,32 +1285,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); - ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - if (mem) - css_put(&mem->css); - if (PageSwapCache(page)) - mem_cgroup_lru_add_after_commit_swapcache(page); + /* shmem */ + if (PageSwapCache(page)) { + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + if (!ret) + __mem_cgroup_commit_charge_swapin(page, mem, + MEM_CGROUP_CHARGE_TYPE_SHMEM); + } else + ret = mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); - if (do_swap_account && !ret && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - unsigned short id; - /* avoid double counting */ - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - mem = mem_cgroup_lookup(id); - if (mem) { - /* - * We did swap-in. Then, this entry is doubly counted - * both in mem and memsw. We uncharge it, here. - * Recorded ID can be obsolete. 
We avoid calling - * css_tryget() - */ - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - mem_cgroup_put(mem); - } - rcu_read_unlock(); - } return ret; } @@ -1359,7 +1337,9 @@ charge_cur_mm: return __mem_cgroup_try_charge(mm, mask, ptr, true); } -void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +static void +__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, + enum charge_type ctype) { struct page_cgroup *pc; @@ -1369,7 +1349,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) return; pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_commit_charge(ptr, pc, ctype); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. This means this page may be @@ -1400,6 +1380,12 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) } +void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) +{ + __mem_cgroup_commit_charge_swapin(page, ptr, + MEM_CGROUP_CHARGE_TYPE_MAPPED); +} + void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) { if (mem_cgroup_disabled()) -- cgit v1.2.2
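For reference, the try/commit/cancel protocol that this cleanup leans on looks roughly like this from a caller's point of view (a sketch under stated assumptions, not a literal kernel excerpt; the condition name is a placeholder):

	struct mem_cgroup *ptr = NULL;
	int err;

	err = mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr);
	if (err)
		return err;	/* charge not taken, nothing to undo */

	if (insertion_succeeded)	/* placeholder for the caller's real test */
		mem_cgroup_commit_charge_swapin(page, ptr);	/* make it stick */
	else
		mem_cgroup_cancel_charge_swapin(ptr);	/* roll the charge back */

Folding shmem's swap-in into this same pattern is what lets mem_cgroup_cache_charge() drop the hand-rolled double-count handling removed above.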