From 856c13aa1ff6136c1968414fdea5938ea9d5ebf2 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Fri, 25 Jul 2008 01:47:04 -0700 Subject: cgroup files: convert res_counter_write() to be a cgroups write_string() handler Currently res_counter_write() is a raw file handler even though it's ultimately taking a number, since in some cases it wants to pre-process the string when converting it to a number. This patch converts res_counter_write() from a raw file handler to a write_string() handler; this allows some of the boilerplate copying/locking/checking to be removed, and simplifies the cleanup path, since these functions are now performed by the cgroups framework. [lizf@cn.fujitsu.com: build fix] Signed-off-by: Paul Menage Cc: Paul Jackson Cc: Pavel Emelyanov Cc: Balbir Singh Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e46451e1d9b7..7385d58fb061 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -838,32 +838,18 @@ out: return ret; } -static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) -{ - *tmp = memparse(buf, &buf); - if (*buf != '\0') - return -EINVAL; - - /* - * Round up the value to the closest page size - */ - *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; - return 0; -} - static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, cft->private); } -static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) { return res_counter_write(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - 
mem_cgroup_write_strategy); + cft->private, buffer, + res_counter_memparse_write_strategy); } static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) @@ -940,7 +926,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, - .write = mem_cgroup_write, + .write_string = mem_cgroup_write, .read_u64 = mem_cgroup_read, }, { -- cgit v1.2.2 From a181b0e888a1d917edcab57cd73ccf7d8e75a46c Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:08 -0700 Subject: memcg: make global var read_mostly mem_cgroup_subsys and page_cgroup_cache should be read_mostly and MEM_CGROUP_RECLAIM_RETRIES can be just a fixed number. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7385d58fb061..c52c045f5152 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,9 +35,9 @@ #include -struct cgroup_subsys mem_cgroup_subsys; -static const int MEM_CGROUP_RECLAIM_RETRIES = 5; -static struct kmem_cache *page_cgroup_cache; +struct cgroup_subsys mem_cgroup_subsys __read_mostly; +static struct kmem_cache *page_cgroup_cache __read_mostly; +#define MEM_CGROUP_RECLAIM_RETRIES 5 /* * Statistics for memory cgroup. -- cgit v1.2.2 From 508b7be0a5b06b64203512ed9b34191cddc83f56 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:09 -0700 Subject: memcg: avoid unnecessary initialization * remove over-killing initialization (in fast path) * makeing the condition for PAGE_CGROUP_FLAG_ACTIVE be more obvious. 
Signed-off-by: KAMEAZAWA Hiroyuki Reviewed-by: Li Zefan Acked-by: Balbir Singh Acked-by: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c52c045f5152..90ccc1326356 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); - list_del_init(&pc->lru); + list_del(&pc->lru); } static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, @@ -559,7 +559,7 @@ retry: } unlock_page_cgroup(page); - pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); + pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); if (pc == NULL) goto err; @@ -606,9 +606,14 @@ retry: pc->ref_cnt = 1; pc->mem_cgroup = mem; pc->page = page; - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; + /* + * If a page is accounted as a page cache, insert to inactive list. + * If anon, insert to active list. + */ if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) pc->flags = PAGE_CGROUP_FLAG_CACHE; + else + pc->flags = PAGE_CGROUP_FLAG_ACTIVE; lock_page_cgroup(page); if (page_get_page_cgroup(page)) { -- cgit v1.2.2 From e8589cc189f96b87348ae83ea4db38eaac624135 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:10 -0700 Subject: memcg: better migration handling This patch changes page migration under memory controller to use a different algorithm. (thanks to Christoph for new idea.) Before: - page_cgroup is migrated from an old page to a new page. After: - a new page is accounted, no reuse of page_cgroup. Pros: - We can avoid complicated lock dependencies and races in migration. Cons: - new param to mem_cgroup_charge_common(). - mem_cgroup_getref() is added for handling ref_cnt ping-pong. 
This version simplifies complicated lock dependency in page migration under memory resource controller. The new refcnt sequence is as follows. a mapped page: prepare_migration() ..... +1 to NEW page try_to_unmap() ..... all refs to OLD page are gone. move_pages() ..... +1 to NEW page if page cache. remap... ..... all refs from *map* are added to NEW one. end_migration() ..... -1 to NEW page. page's mapcount + (page_is_cache) refs are added to NEW one. Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Li Zefan Cc: YAMAMOTO Takashi Cc: Hugh Dickins Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 128 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 65 insertions(+), 63 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 90ccc1326356..da5912b84551 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -524,7 +524,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * < 0 if the cgroup is over its limit */ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) + gfp_t gfp_mask, enum charge_type ctype, + struct mem_cgroup *memcg) { struct mem_cgroup *mem; struct page_cgroup *pc; @@ -569,16 +570,21 @@ retry: * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). 
*/ - if (!mm) - mm = &init_mm; + if (!memcg) { + if (!mm) + mm = &init_mm; - rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - /* - * For every charge from the cgroup, increment reference count - */ - css_get(&mem->css); - rcu_read_unlock(); + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + /* + * For every charge from the cgroup, increment reference count + */ + css_get(&mem->css); + rcu_read_unlock(); + } else { + mem = memcg; + css_get(&memcg->css); + } while (res_counter_charge(&mem->res, PAGE_SIZE)) { if (!(gfp_mask & __GFP_WAIT)) @@ -648,7 +654,7 @@ err: int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, @@ -657,7 +663,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!mm) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); +} + +int mem_cgroup_getref(struct page *page) +{ + struct page_cgroup *pc; + + if (mem_cgroup_subsys.disabled) + return 0; + + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); + VM_BUG_ON(!pc); + pc->ref_cnt++; + unlock_page_cgroup(page); + return 0; } /* @@ -707,65 +728,39 @@ unlock: } /* - * Returns non-zero if a page (under migration) has valid page_cgroup member. - * Refcnt of page_cgroup is incremented. + * Before starting migration, account against new page. 
*/ -int mem_cgroup_prepare_migration(struct page *page) +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) { struct page_cgroup *pc; + struct mem_cgroup *mem = NULL; + enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + int ret = 0; if (mem_cgroup_subsys.disabled) return 0; lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (pc) - pc->ref_cnt++; + if (pc) { + mem = pc->mem_cgroup; + css_get(&mem->css); + if (pc->flags & PAGE_CGROUP_FLAG_CACHE) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + } unlock_page_cgroup(page); - return pc != NULL; -} - -void mem_cgroup_end_migration(struct page *page) -{ - mem_cgroup_uncharge_page(page); + if (mem) { + ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, + ctype, mem); + css_put(&mem->css); + } + return ret; } -/* - * We know both *page* and *newpage* are now not-on-LRU and PG_locked. - * And no race with uncharge() routines because page_cgroup for *page* - * has extra one reference by mem_cgroup_prepare_migration. 
- */ -void mem_cgroup_page_migration(struct page *page, struct page *newpage) +/* remove redundant charge */ +void mem_cgroup_end_migration(struct page *newpage) { - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - unsigned long flags; - - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (!pc) { - unlock_page_cgroup(page); - return; - } - - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); - - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - - pc->page = newpage; - lock_page_cgroup(newpage); - page_assign_page_cgroup(newpage, pc); - - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_add_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); - - unlock_page_cgroup(newpage); + mem_cgroup_uncharge_page(newpage); } /* @@ -795,12 +790,19 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, page = pc->page; get_page(page); spin_unlock_irqrestore(&mz->lru_lock, flags); - mem_cgroup_uncharge_page(page); - put_page(page); - if (--count <= 0) { - count = FORCE_UNCHARGE_BATCH; + /* + * Check if this page is on LRU. !LRU page can be found + * if it's under page migration. + */ + if (PageLRU(page)) { + mem_cgroup_uncharge_page(page); + put_page(page); + if (--count <= 0) { + count = FORCE_UNCHARGE_BATCH; + cond_resched(); + } + } else cond_resched(); - } spin_lock_irqsave(&mz->lru_lock, flags); } spin_unlock_irqrestore(&mz->lru_lock, flags); -- cgit v1.2.2 From 69029cd550284e32de13d6dd2f77b723c8a0e444 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:14 -0700 Subject: memcg: remove refcnt from page_cgroup memcg: performance improvements Patch Description 1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed) 2/5 ... swapcache handling patch 3/5 ... add helper function for shmem's memory reclaim patch 4/5 ... 
optimize by likely/unlikely patch 5/5 ... remove redundant check patch (shmem handling is fixed.) Unix bench result. == 2.6.26-rc2-mm1 + memory resource controller Execl Throughput 2915.4 lps (29.6 secs, 3 samples) C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples) File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 2915.4 678.0 File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0 File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8 Shell Scripts (8 concurrent) 6.0 1097.7 1829.5 ========= FINAL SCORE 991.3 == 2.6.26-rc2-mm1 + this set == Execl Throughput 3012.9 lps (29.9 secs, 3 samples) C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 
3 samples) File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 3012.9 700.7 File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7 File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3 File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0 Shell Scripts (8 concurrent) 6.0 1120.3 1867.2 ========= FINAL SCORE 1004.6 This patch: Remove refcnt from page_cgroup(). After this, * A page is charged only when !page_mapped() && no page_cgroup is assigned. * Anon page is newly mapped. * File page is added to mapping->tree. * A page is uncharged only when * Anon page is fully unmapped. * File page is removed from LRU. There is no change in behavior from the user's view. This patch also removes unnecessary calls in rmap.c which were used only for refcnt management. [akpm@linux-foundation.org: fix warning] [hugh@veritas.com: fix shmem_unuse_inode charging] Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. 
Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 109 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 46 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da5912b84551..a61706193c31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -166,7 +166,6 @@ struct page_cgroup { struct list_head lru; /* per cgroup LRU list */ struct page *page; struct mem_cgroup *mem_cgroup; - int ref_cnt; /* cached, mapped, migrating */ int flags; }; #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ @@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc) enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ }; /* @@ -552,9 +552,7 @@ retry: */ if (pc) { VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); - - pc->ref_cnt++; + VM_BUG_ON(!pc->mem_cgroup); unlock_page_cgroup(page); goto done; } @@ -570,10 +568,7 @@ retry: * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - if (!memcg) { - if (!mm) - mm = &init_mm; - + if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); /* @@ -609,7 +604,6 @@ retry: } } - pc->ref_cnt = 1; pc->mem_cgroup = mem; pc->page = page; /* @@ -653,6 +647,17 @@ err: int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + /* + * If already mapped, we don't have to account. + * If page cache, page->mapping has address_space. + * But page->mapping may have out-of-use anon_vma pointer, + * detecit it by PageAnon() check. newly-mapped-anon's page->mapping + * is NULL. 
+ */ + if (page_mapped(page) || (page->mapping && !PageAnon(page))) + return 0; + if (unlikely(!mm)) + mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } @@ -660,32 +665,17 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - if (!mm) + if (unlikely(!mm)) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); } -int mem_cgroup_getref(struct page *page) -{ - struct page_cgroup *pc; - - if (mem_cgroup_subsys.disabled) - return 0; - - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - VM_BUG_ON(!pc); - pc->ref_cnt++; - unlock_page_cgroup(page); - return 0; -} - /* - * Uncharging is always a welcome operation, we never complain, simply - * uncharge. + * uncharge if !page_mapped(page) */ -void mem_cgroup_uncharge_page(struct page *page) +static void +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem; @@ -704,29 +694,41 @@ void mem_cgroup_uncharge_page(struct page *page) goto unlock; VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); - if (--(pc->ref_cnt) == 0) { - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); + if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) + || page_mapped(page))) + goto unlock; - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); + mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_remove_list(mz, pc); + spin_unlock_irqrestore(&mz->lru_lock, flags); - mem = pc->mem_cgroup; - res_counter_uncharge(&mem->res, PAGE_SIZE); - css_put(&mem->css); + page_assign_page_cgroup(page, NULL); + unlock_page_cgroup(page); - kmem_cache_free(page_cgroup_cache, 
pc); - return; - } + mem = pc->mem_cgroup; + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); + kmem_cache_free(page_cgroup_cache, pc); + return; unlock: unlock_page_cgroup(page); } +void mem_cgroup_uncharge_page(struct page *page) +{ + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); +} + +void mem_cgroup_uncharge_cache_page(struct page *page) +{ + VM_BUG_ON(page_mapped(page)); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); +} + /* * Before starting migration, account against new page. */ @@ -757,15 +759,29 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) return ret; } -/* remove redundant charge */ +/* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct page *newpage) { - mem_cgroup_uncharge_page(newpage); + /* + * At success, page->mapping is not NULL. + * special rollback care is necessary when + * 1. at migration failure. (newpage->mapping is cleared in this case) + * 2. the newpage was moved but not remapped again because the task + * exits and the newpage is obsolete. In this case, the new page + * may be a swapcache. So, we just call mem_cgroup_uncharge_page() + * always for avoiding mess. The page_cgroup will be removed if + * unnecessary. File cache pages is still on radix-tree. Don't + * care it. + */ + if (!newpage->mapping) + __mem_cgroup_uncharge_common(newpage, + MEM_CGROUP_CHARGE_TYPE_FORCE); + else if (PageAnon(newpage)) + mem_cgroup_uncharge_page(newpage); } /* * This routine traverse page_cgroup in given list and drop them all. - * This routine ignores page_cgroup->ref_cnt. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ #define FORCE_UNCHARGE_BATCH (128) @@ -795,7 +811,8 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, * if it's under page migration. 
*/ if (PageLRU(page)) { - mem_cgroup_uncharge_page(page); + __mem_cgroup_uncharge_common(page, + MEM_CGROUP_CHARGE_TYPE_FORCE); put_page(page); if (--count <= 0) { count = FORCE_UNCHARGE_BATCH; -- cgit v1.2.2 From c9b0ed51483cc2fc42bb801b6675c4231b0e4634 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:15 -0700 Subject: memcg: helper function for relcaim from shmem. A new call, mem_cgroup_shrink_usage() is added for shmem handling and replacing non-standard usage of mem_cgroup_charge/uncharge. Now, shmem calls mem_cgroup_charge() just to reclaim some pages from mem_cgroup. In general, shmem is used by some process group and not for global resource (like file caches). So, it's reasonable to reclaim pages from mem_cgroup where shmem is mainly used. [hugh@veritas.com: shmem_getpage release page sooner] [hugh@veritas.com: mem_cgroup_shrink_usage css_put] Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a61706193c31..f46b8615de6c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -780,6 +780,32 @@ void mem_cgroup_end_migration(struct page *newpage) mem_cgroup_uncharge_page(newpage); } +/* + * A call to try to shrink memory usage under specified resource controller. + * This is typically used for page reclaiming for shmem for reducing side + * effect of page allocation from shmem, which is used by some mem_cgroup. 
+ */ +int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) +{ + struct mem_cgroup *mem; + int progress = 0; + int retry = MEM_CGROUP_RECLAIM_RETRIES; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + css_get(&mem->css); + rcu_read_unlock(); + + do { + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + } while (!progress && --retry); + + css_put(&mem->css); + if (!retry) + return -ENOMEM; + return 0; +} + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. -- cgit v1.2.2 From b76734e5e34e1889ab9fc5f3756570b1129f0f50 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:16 -0700 Subject: memcg: add hints for branch Showing branch direction for obvious conditions. Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f46b8615de6c..04ded27f6226 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -550,7 +550,7 @@ retry: * The page_cgroup exists and * the page has already been accounted. */ - if (pc) { + if (unlikely(pc)) { VM_BUG_ON(pc->page != page); VM_BUG_ON(!pc->mem_cgroup); unlock_page_cgroup(page); @@ -559,7 +559,7 @@ retry: unlock_page_cgroup(page); pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); - if (pc == NULL) + if (unlikely(pc == NULL)) goto err; /* @@ -616,7 +616,7 @@ retry: pc->flags = PAGE_CGROUP_FLAG_ACTIVE; lock_page_cgroup(page); - if (page_get_page_cgroup(page)) { + if (unlikely(page_get_page_cgroup(page))) { unlock_page_cgroup(page); /* * Another charge has been added to this page already. 
@@ -690,7 +690,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) */ lock_page_cgroup(page); pc = page_get_page_cgroup(page); - if (!pc) + if (unlikely(!pc)) goto unlock; VM_BUG_ON(pc->page != page); -- cgit v1.2.2 From accf163e6ab729f1fc5fffaa0310e498270bf4e7 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:17 -0700 Subject: memcg: remove a redundant check Because of the remove-refcnt patch, it is now a very rare case that mem_cgroup_charge_common() is called against a page which is already accounted. mem_cgroup_charge_common() is called when: 1. a page is added into file cache. 2. an anon page is _newly_ mapped. A racy case is that a newly-swapped-in anonymous page is referenced by multiple threads in do_swap_page() at the same time. (a page is not Locked when mem_cgroup_charge() is called from do_swap_page.) Another case is shmem. It charges its page before calling add_to_page_cache(). Then, mem_cgroup_cache_charge() is called twice. This case is handled in mem_cgroup_cache_charge(). But this check may be too hacky... Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 53 +++++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 04ded27f6226..5b3759bd5494 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -536,28 +536,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, if (mem_cgroup_subsys.disabled) return 0; - /* - * Should page_cgroup's go to their own slab? 
- * One could optimize the performance of the charging routine - * by saving a bit in the page_flags and using it as a lock - * to see if the cgroup page already has a page_cgroup associated - * with it - */ -retry: - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - /* - * The page_cgroup exists and - * the page has already been accounted. - */ - if (unlikely(pc)) { - VM_BUG_ON(pc->page != page); - VM_BUG_ON(!pc->mem_cgroup); - unlock_page_cgroup(page); - goto done; - } - unlock_page_cgroup(page); - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); if (unlikely(pc == NULL)) goto err; @@ -618,15 +596,10 @@ retry: lock_page_cgroup(page); if (unlikely(page_get_page_cgroup(page))) { unlock_page_cgroup(page); - /* - * Another charge has been added to this page already. - * We take lock_page_cgroup(page) again and read - * page->cgroup, increment refcnt.... just retry is OK. - */ res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); kmem_cache_free(page_cgroup_cache, pc); - goto retry; + goto done; } page_assign_page_cgroup(page, pc); @@ -665,8 +638,32 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + /* + * Corner case handling. This is called from add_to_page_cache() + * in usual. But some FS (shmem) precharges this page before calling it + * and call add_to_page_cache() with GFP_NOWAIT. + * + * For GFP_NOWAIT case, the page may be pre-charged before calling + * add_to_page_cache(). (See shmem.c) check it here and avoid to call + * charge twice. (It works but has to pay a bit larger cost.) 
+ */ + if (!(gfp_mask & __GFP_WAIT)) { + struct page_cgroup *pc; + + lock_page_cgroup(page); + pc = page_get_page_cgroup(page); + if (pc) { + VM_BUG_ON(pc->page != page); + VM_BUG_ON(!pc->mem_cgroup); + unlock_page_cgroup(page); + return 0; + } + unlock_page_cgroup(page); + } + if (unlikely(!mm)) mm = &init_mm; + return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); } -- cgit v1.2.2 From cede86acd8bd5d2205dec28db8ac86410a3a19e8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 25 Jul 2008 01:47:18 -0700 Subject: memcg: clean up checking of the disabled flag Those checks are unnecessary, because when the subsystem is disabled it can't be mounted, so those functions won't get called. The check is needed in functions which will be called in other places except cgroup. [hugh@veritas.com: further checking of disabled flag] Signed-off-by: Li Zefan Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b3759bd5494..0c035647d36a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active) struct mem_cgroup_per_zone *mz; unsigned long flags; + if (mem_cgroup_subsys.disabled) + return; + /* * We cannot lock_page_cgroup while holding zone's lru_lock, * because other holders of lock_page_cgroup can be interrupted @@ -533,9 +536,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup_per_zone *mz; - if (mem_cgroup_subsys.disabled) - return 0; - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); if (unlikely(pc == NULL)) goto err; @@ -620,6 +620,9 @@ err: int mem_cgroup_charge(struct page 
*page, struct mm_struct *mm, gfp_t gfp_mask) { + if (mem_cgroup_subsys.disabled) + return 0; + /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -638,6 +641,9 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + if (mem_cgroup_subsys.disabled) + return 0; + /* * Corner case handling. This is called from add_to_page_cache() * in usual. But some FS (shmem) precharges this page before calling it @@ -788,6 +794,9 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) int progress = 0; int retry = MEM_CGROUP_RECLAIM_RETRIES; + if (mem_cgroup_subsys.disabled) + return 0; + rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); css_get(&mem->css); @@ -857,9 +866,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) int ret = -EBUSY; int node, zid; - if (mem_cgroup_subsys.disabled) - return 0; - css_get(&mem->css); /* * page reclaim code (kswapd etc..) will move pages between @@ -1103,8 +1109,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, static int mem_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) { - if (mem_cgroup_subsys.disabled) - return 0; return cgroup_add_files(cont, ss, mem_cgroup_files, ARRAY_SIZE(mem_cgroup_files)); } @@ -1117,9 +1121,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct mm_struct *mm; struct mem_cgroup *mem, *old_mem; - if (mem_cgroup_subsys.disabled) - return; - mm = get_task_mm(p); if (mm == NULL) return; -- cgit v1.2.2 From 628f42355389cfb596ca3a5a5f64fb9054a2a06a Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:20 -0700 Subject: memcg: limit change shrink usage Shrinking memory usage at limit change. 
[akpm@linux-foundation.org: coding-style fixes] Acked-by: Balbir Singh Acked-by: Pavel Emelyanov Signed-off-by: KAMEZAWA Hiroyuki Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0c035647d36a..fba566c51322 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -812,6 +812,30 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) return 0; } +int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) +{ + + int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int progress; + int ret = 0; + + while (res_counter_set_limit(&memcg->res, val)) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (!retry_count) { + ret = -EBUSY; + break; + } + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); + if (!progress) + retry_count--; + } + return ret; +} + + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. @@ -896,13 +920,29 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, cft->private); } - +/* + * The user of this function is... + * RES_LIMIT. 
+ */ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, const char *buffer) { - return res_counter_write(&mem_cgroup_from_cont(cont)->res, - cft->private, buffer, - res_counter_memparse_write_strategy); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + unsigned long long val; + int ret; + + switch (cft->private) { + case RES_LIMIT: + /* This function does all necessary parse...reuse it */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (!ret) + ret = mem_cgroup_resize_limit(memcg, val); + break; + default: + ret = -EINVAL; /* should be BUG() ? */ + break; + } + return ret; } static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) -- cgit v1.2.2