memcg: allocate all page_cgroup at boot

Allocate all page_cgroup at boot and remove the page_cgroup pointer from struct page. This patch adds an interface, struct page_cgroup *lookup_page_cgroup(struct page*). All of FLATMEM/DISCONTIGMEM/SPARSEMEM and MEMORY_HOTPLUG are supported. Removing the page_cgroup pointer reduces the amount of memory by - 4 bytes per PAGE_SIZE. - 8 bytes per PAGE_SIZE if memory controller is disabled. (even if configured.) On a usual 8GB x86-32 server, this saves 8MB of NORMAL_ZONE memory. On my x86-64 server with 48GB of memory, this saves 96MB of memory. I think this reduction makes sense. By pre-allocation, kmalloc/kfree in charge/uncharge are removed. This means - we no longer need to be afraid of kmalloc failure. (this can happen because of gfp_mask type.) - we can avoid calling kmalloc/kfree. - we can avoid allocating tons of small objects which can be fragmented. - we can know what amount of memory will be used for this extra-lru handling. I added a printk message, "allocated %ld bytes of page_cgroup" "please try cgroup_disable=memory option if you don't want", which is hopefully informative enough for users. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Reviewed-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> 2008-10-18 23:28:16 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-10-20 11:52:39 -0400
commit: 52d4b9ac0b985168009c2a57098324e67bae171f (patch)
tree: b3e3b854166930af893be90ea30a7ab0d65c59e7 /mm/memcontrol.c
parent: c05555b572921c464d064d9267f7f7bc06d424fa (diff)
1 files changed, 80 insertions, 167 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 031682e7ef0c..d4a92b63e98e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,11 +33,11 @@
 #include <linux/seq_file.h>
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
+#include <linux/page_cgroup.h>
 #include <asm/uaccess.h>
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-static struct kmem_cache *page_cgroup_cache __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES      5
 /*
@@ -135,79 +135,6 @@ struct mem_cgroup {
 };
 static struct mem_cgroup init_mem_cgroup;
-/*
- * We use the lower bit of the page->page_cgroup pointer as a bit spin
- * lock.  We need to ensure that page->page_cgroup is at least two
- * byte aligned (based on comments from Nick Piggin).  But since
- * bit_spin_lock doesn't actually set that lock bit in a non-debug
- * uniprocessor kernel, we should avoid setting it here too.
- */
-#define PAGE_CGROUP_LOCK_BIT    0x0
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define PAGE_CGROUP_LOCK        (1 << PAGE_CGROUP_LOCK_BIT)
-#else
-#define PAGE_CGROUP_LOCK        0x0
-#endif
-/*
- * A page_cgroup page is associated with every page descriptor. The
- * page_cgroup helps us identify information about the cgroup
- */
-struct page_cgroup {
-        struct list_head lru;           /* per cgroup LRU list */
-        struct page *page;
-        struct mem_cgroup *mem_cgroup;
-        unsigned long flags;
-};
-enum {
-        /* flags for mem_cgroup */
-        PCG_CACHE, /* charged as cache */
-        /* flags for LRU placement */
-        PCG_ACTIVE, /* page is active in this cgroup */
-        PCG_FILE, /* page is file system backed */
-        PCG_UNEVICTABLE, /* page is unevictableable */
-};
-#define TESTPCGFLAG(uname, lname)                       \
-static inline int PageCgroup##uname(struct page_cgroup *pc)     \
-        { return test_bit(PCG_##lname, &pc->flags); }
-#define SETPCGFLAG(uname, lname)                        \
-static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
-        { set_bit(PCG_##lname, &pc->flags);  }
-#define CLEARPCGFLAG(uname, lname)                      \
-static inline void ClearPageCgroup##uname(struct page_cgroup *pc)       \
-        { clear_bit(PCG_##lname, &pc->flags);  }
-/* Cache flag is set only once (at allocation) */
-TESTPCGFLAG(Cache, CACHE)
-/* LRU management flags (from global-lru definition) */
-TESTPCGFLAG(File, FILE)
-SETPCGFLAG(File, FILE)
-CLEARPCGFLAG(File, FILE)
-TESTPCGFLAG(Active, ACTIVE)
-SETPCGFLAG(Active, ACTIVE)
-CLEARPCGFLAG(Active, ACTIVE)
-TESTPCGFLAG(Unevictable, UNEVICTABLE)
-SETPCGFLAG(Unevictable, UNEVICTABLE)
-CLEARPCGFLAG(Unevictable, UNEVICTABLE)
-static int page_cgroup_nid(struct page_cgroup *pc)
-{
-        return page_to_nid(pc->page);
-}
-static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
-{
-        return page_zonenum(pc->page);
-}
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -216,12 +143,18 @@ enum charge_type {
        NR_CHARGE_TYPE,
 };
+/* only for here (for easy reading.) */
+#define PCGF_CACHE      (1UL << PCG_CACHE)
+#define PCGF_USED       (1UL << PCG_USED)
+#define PCGF_ACTIVE     (1UL << PCG_ACTIVE)
+#define PCGF_LOCK       (1UL << PCG_LOCK)
+#define PCGF_FILE       (1UL << PCG_FILE)
 static const unsigned long
 pcg_default_flags[NR_CHARGE_TYPE] = {
-        ((1 << PCG_CACHE) | (1 << PCG_FILE)),
+        PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
-        ((1 << PCG_ACTIVE)),
+        PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
-        ((1 << PCG_ACTIVE) | (1 << PCG_CACHE)),
+        PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
-        0,
+        0, /* FORCE */
 };
 /*
@@ -303,37 +236,6 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
                                struct mem_cgroup, css);
 }
-static inline int page_cgroup_locked(struct page *page)
-{
-        return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
-{
-        VM_BUG_ON(!page_cgroup_locked(page));
-        page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
-}
-struct page_cgroup *page_get_page_cgroup(struct page *page)
-{
-        return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
-}
-static void lock_page_cgroup(struct page *page)
-{
-        bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-static int try_lock_page_cgroup(struct page *page)
-{
-        return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
-static void unlock_page_cgroup(struct page *page)
-{
-        bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
-}
 static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
                        struct page_cgroup *pc)
 {
@@ -436,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
         * safely get to page_cgroup without it, so just try_lock it:
         * mem_cgroup_isolate_pages allows for page left on wrong list.
         */
-        if (!try_lock_page_cgroup(page))
+        pc = lookup_page_cgroup(page);
+        if (!trylock_page_cgroup(pc))
                return;
+        if (pc && PageCgroupUsed(pc)) {
-        pc = page_get_page_cgroup(page);
-        if (pc) {
                mz = page_cgroup_zoneinfo(pc);
                spin_lock_irqsave(&mz->lru_lock, flags);
                __mem_cgroup_move_lists(pc, lru);
                spin_unlock_irqrestore(&mz->lru_lock, flags);
        }
-        unlock_page_cgroup(page);
+        unlock_page_cgroup(pc);
 }
 /*
@@ -533,6 +434,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
        list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
                if (scan >= nr_to_scan)
                        break;
+                if (unlikely(!PageCgroupUsed(pc)))
+                        continue;
                page = pc->page;
                if (unlikely(!PageLRU(page)))
@@ -576,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 {
        struct mem_cgroup *mem;
        struct page_cgroup *pc;
-        unsigned long flags;
        unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct mem_cgroup_per_zone *mz;
+        unsigned long flags;
-        pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
+        pc = lookup_page_cgroup(page);
-        if (unlikely(pc == NULL))
+        /* can happen at boot */
-                goto err;
+        if (unlikely(!pc))
+                return 0;
+        prefetchw(pc);
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
         * thread group leader migrates. It's possible that mm is not
         * set, if so charge the init_mm (happens for pagecache usage).
         */
        if (likely(!memcg)) {
                rcu_read_lock();
                mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
                if (unlikely(!mem)) {
                        rcu_read_unlock();
-                        kmem_cache_free(page_cgroup_cache, pc);
                        return 0;
                }
                /*
@@ -631,36 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
                }
        }
+        lock_page_cgroup(pc);
+        if (unlikely(PageCgroupUsed(pc))) {
+                unlock_page_cgroup(pc);
+                res_counter_uncharge(&mem->res, PAGE_SIZE);
+                css_put(&mem->css);
+                goto done;
+        }
        pc->mem_cgroup = mem;
-        pc->page = page;
        /*
         * If a page is accounted as a page cache, insert to inactive list.
         * If anon, insert to active list.
         */
        pc->flags = pcg_default_flags[ctype];
-        lock_page_cgroup(page);
-        if (unlikely(page_get_page_cgroup(page))) {
-                unlock_page_cgroup(page);
-                res_counter_uncharge(&mem->res, PAGE_SIZE);
-                css_put(&mem->css);
-                kmem_cache_free(page_cgroup_cache, pc);
-                goto done;
-        }
-        page_assign_page_cgroup(page, pc);
        mz = page_cgroup_zoneinfo(pc);
        spin_lock_irqsave(&mz->lru_lock, flags);
        __mem_cgroup_add_list(mz, pc);
        spin_unlock_irqrestore(&mz->lru_lock, flags);
+        unlock_page_cgroup(pc);
-        unlock_page_cgroup(page);
 done:
        return 0;
 out:
        css_put(&mem->css);
-        kmem_cache_free(page_cgroup_cache, pc);
-err:
        return -ENOMEM;
 }
@@ -668,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
        if (mem_cgroup_subsys.disabled)
                return 0;
+        if (PageCompound(page))
+                return 0;
        /*
         * If already mapped, we don't have to account.
         * If page cache, page->mapping has address_space.
@@ -689,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 {
        if (mem_cgroup_subsys.disabled)
                return 0;
+        if (PageCompound(page))
+                return 0;
        /*
         * Corner case handling. This is called from add_to_page_cache()
         * in usual. But some FS (shmem) precharges this page before calling it
@@ -702,15 +605,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
        if (!(gfp_mask & __GFP_WAIT)) {
                struct page_cgroup *pc;
-                lock_page_cgroup(page);
-                pc = page_get_page_cgroup(page);
+                pc = lookup_page_cgroup(page);
-                if (pc) {
+                if (!pc)
-                        VM_BUG_ON(pc->page != page);
+                        return 0;
-                        VM_BUG_ON(!pc->mem_cgroup);
+                lock_page_cgroup(pc);
-                        unlock_page_cgroup(page);
+                if (PageCgroupUsed(pc)) {
+                        unlock_page_cgroup(pc);
                        return 0;
                }
-                unlock_page_cgroup(page);
+                unlock_page_cgroup(pc);
        }
        if (unlikely(!mm))
@@ -741,37 +645,39 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        /*
         * Check if our page_cgroup is valid
         */
-        lock_page_cgroup(page);
+        pc = lookup_page_cgroup(page);
-        pc = page_get_page_cgroup(page);
+        if (unlikely(!pc || !PageCgroupUsed(pc)))
-        if (unlikely(!pc))
+                return;
-                goto unlock;
-        VM_BUG_ON(pc->page != page);
-        if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
+        lock_page_cgroup(pc);
-            && ((PageCgroupCache(pc) || page_mapped(page))))
+        if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
-                goto unlock;
+             || !PageCgroupUsed(pc)) {
+                /* This happens at race in zap_pte_range() and do_swap_page()*/
+                unlock_page_cgroup(pc);
+                return;
+        }
+        ClearPageCgroupUsed(pc);
+        mem = pc->mem_cgroup;
        mz = page_cgroup_zoneinfo(pc);
        spin_lock_irqsave(&mz->lru_lock, flags);
        __mem_cgroup_remove_list(mz, pc);
        spin_unlock_irqrestore(&mz->lru_lock, flags);
+        unlock_page_cgroup(pc);
-        page_assign_page_cgroup(page, NULL);
-        unlock_page_cgroup(page);
-        mem = pc->mem_cgroup;
        res_counter_uncharge(&mem->res, PAGE_SIZE);
        css_put(&mem->css);
-        kmem_cache_free(page_cgroup_cache, pc);
        return;
-unlock:
-        unlock_page_cgroup(page);
 }
 void mem_cgroup_uncharge_page(struct page *page)
 {
+        /* early check. */
+        if (page_mapped(page))
+                return;
+        if (page->mapping && !PageAnon(page))
+                return;
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
@@ -795,9 +701,9 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
        if (mem_cgroup_subsys.disabled)
                return 0;
-        lock_page_cgroup(page);
+        pc = lookup_page_cgroup(page);
-        pc = page_get_page_cgroup(page);
+        lock_page_cgroup(pc);
-        if (pc) {
+        if (PageCgroupUsed(pc)) {
                mem = pc->mem_cgroup;
                css_get(&mem->css);
                if (PageCgroupCache(pc)) {
@@ -807,7 +713,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
                                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
                }
        }
-        unlock_page_cgroup(page);
+        unlock_page_cgroup(pc);
        if (mem) {
                ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
                        ctype, mem);
@@ -832,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
         */
        if (!newpage->mapping)
                __mem_cgroup_uncharge_common(newpage,
-                                         MEM_CGROUP_CHARGE_TYPE_FORCE);
+                                MEM_CGROUP_CHARGE_TYPE_FORCE);
        else if (PageAnon(newpage))
                mem_cgroup_uncharge_page(newpage);
 }
@@ -918,6 +824,8 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
        while (!list_empty(list)) {
                pc = list_entry(list->prev, struct page_cgroup, lru);
                page = pc->page;
+                if (!PageCgroupUsed(pc))
+                        break;
                get_page(page);
                spin_unlock_irqrestore(&mz->lru_lock, flags);
                /*
@@ -932,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
                                count = FORCE_UNCHARGE_BATCH;
                                cond_resched();
                        }
-                } else
+                } else {
-                        cond_resched();
+                        spin_lock_irqsave(&mz->lru_lock, flags);
+                        break;
+                }
                spin_lock_irqsave(&mz->lru_lock, flags);
        }
        spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -957,6 +867,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
        while (mem->res.usage > 0) {
                if (atomic_read(&mem->css.cgroup->count) > 0)
                        goto out;
+                /* This is for making all *used* pages to be on LRU. */
+                lru_add_drain_all();
                for_each_node_state(node, N_POSSIBLE)
                        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                                struct mem_cgroup_per_zone *mz;
@@ -965,6 +877,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
                                for_each_lru(l)
                                        mem_cgroup_force_empty_list(mem, mz, l);
                        }
+                cond_resched();
        }
        ret = 0;
 out:
@@ -1175,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        int node;
        if (unlikely((cont->parent) == NULL)) {
+                page_cgroup_init();
                mem = &init_mem_cgroup;
-                page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
        } else {
                mem = mem_cgroup_alloc();
                if (!mem)
author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2008-10-18 23:28:16 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-10-20 11:52:39 -0400
commit	52d4b9ac0b985168009c2a57098324e67bae171f (patch)
tree	b3e3b854166930af893be90ea30a7ab0d65c59e7 /mm/memcontrol.c
parent	c05555b572921c464d064d9267f7f7bc06d424fa (diff)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 031682e7ef0c..d4a92b63e98e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c
@@ -33,11 +33,11 @@
33	#include <linux/seq_file.h>	33	#include <linux/seq_file.h>
34	#include <linux/vmalloc.h>	34	#include <linux/vmalloc.h>
35	#include <linux/mm_inline.h>	35	#include <linux/mm_inline.h>
		36	#include <linux/page_cgroup.h>
36		37
37	#include <asm/uaccess.h>	38	#include <asm/uaccess.h>
38		39
39	struct cgroup_subsys mem_cgroup_subsys __read_mostly;	40	struct cgroup_subsys mem_cgroup_subsys __read_mostly;
40	static struct kmem_cache *page_cgroup_cache __read_mostly;
41	#define MEM_CGROUP_RECLAIM_RETRIES 5	41	#define MEM_CGROUP_RECLAIM_RETRIES 5
42		42
43	/*	43	/*
@@ -135,79 +135,6 @@ struct mem_cgroup {
135	};	135	};
136	static struct mem_cgroup init_mem_cgroup;	136	static struct mem_cgroup init_mem_cgroup;
137		137
138	/*
139	* We use the lower bit of the page->page_cgroup pointer as a bit spin
140	* lock. We need to ensure that page->page_cgroup is at least two
141	* byte aligned (based on comments from Nick Piggin). But since
142	* bit_spin_lock doesn't actually set that lock bit in a non-debug
143	* uniprocessor kernel, we should avoid setting it here too.
144	*/
145	#define PAGE_CGROUP_LOCK_BIT 0x0
146	#if defined(CONFIG_SMP) \|\| defined(CONFIG_DEBUG_SPINLOCK)
147	#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
148	#else
149	#define PAGE_CGROUP_LOCK 0x0
150	#endif
151
152	/*
153	* A page_cgroup page is associated with every page descriptor. The
154	* page_cgroup helps us identify information about the cgroup
155	*/
156	struct page_cgroup {
157	struct list_head lru; /* per cgroup LRU list */
158	struct page *page;
159	struct mem_cgroup *mem_cgroup;
160	unsigned long flags;
161	};
162
163	enum {
164	/* flags for mem_cgroup */
165	PCG_CACHE, /* charged as cache */
166	/* flags for LRU placement */
167	PCG_ACTIVE, /* page is active in this cgroup */
168	PCG_FILE, /* page is file system backed */
169	PCG_UNEVICTABLE, /* page is unevictableable */
170	};
171
172	#define TESTPCGFLAG(uname, lname) \
173	static inline int PageCgroup##uname(struct page_cgroup *pc) \
174	{ return test_bit(PCG_##lname, &pc->flags); }
175
176	#define SETPCGFLAG(uname, lname) \
177	static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
178	{ set_bit(PCG_##lname, &pc->flags); }
179
180	#define CLEARPCGFLAG(uname, lname) \
181	static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \
182	{ clear_bit(PCG_##lname, &pc->flags); }
183
184
185	/* Cache flag is set only once (at allocation) */
186	TESTPCGFLAG(Cache, CACHE)
187
188	/* LRU management flags (from global-lru definition) */
189	TESTPCGFLAG(File, FILE)
190	SETPCGFLAG(File, FILE)
191	CLEARPCGFLAG(File, FILE)
192
193	TESTPCGFLAG(Active, ACTIVE)
194	SETPCGFLAG(Active, ACTIVE)
195	CLEARPCGFLAG(Active, ACTIVE)
196
197	TESTPCGFLAG(Unevictable, UNEVICTABLE)
198	SETPCGFLAG(Unevictable, UNEVICTABLE)
199	CLEARPCGFLAG(Unevictable, UNEVICTABLE)
200
201	static int page_cgroup_nid(struct page_cgroup *pc)
202	{
203	return page_to_nid(pc->page);
204	}
205
206	static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
207	{
208	return page_zonenum(pc->page);
209	}
210
211	enum charge_type {	138	enum charge_type {
212	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,	139	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
213	MEM_CGROUP_CHARGE_TYPE_MAPPED,	140	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -216,12 +143,18 @@ enum charge_type {
216	NR_CHARGE_TYPE,	143	NR_CHARGE_TYPE,
217	};	144	};
218		145
		146	/* only for here (for easy reading.) */
		147	#define PCGF_CACHE (1UL << PCG_CACHE)
		148	#define PCGF_USED (1UL << PCG_USED)
		149	#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
		150	#define PCGF_LOCK (1UL << PCG_LOCK)
		151	#define PCGF_FILE (1UL << PCG_FILE)
219	static const unsigned long	152	static const unsigned long
220	pcg_default_flags[NR_CHARGE_TYPE] = {	153	pcg_default_flags[NR_CHARGE_TYPE] = {
221	((1 << PCG_CACHE) \| (1 << PCG_FILE)),	154	PCGF_CACHE \| PCGF_FILE \| PCGF_USED \| PCGF_LOCK, /* File Cache */
222	((1 << PCG_ACTIVE)),	155	PCGF_ACTIVE \| PCGF_USED \| PCGF_LOCK, /* Anon */
223	((1 << PCG_ACTIVE) \| (1 << PCG_CACHE)),	156	PCGF_ACTIVE \| PCGF_CACHE \| PCGF_USED \| PCGF_LOCK, /* Shmem */
224	0,	157	0, /* FORCE */
225	};	158	};
226		159
227	/*	160	/*
@@ -303,37 +236,6 @@ struct mem_cgroup mem_cgroup_from_task(struct task_struct p)
303	struct mem_cgroup, css);	236	struct mem_cgroup, css);
304	}	237	}
305		238
306	static inline int page_cgroup_locked(struct page *page)
307	{
308	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
309	}
310
311	static void page_assign_page_cgroup(struct page page, struct page_cgroup pc)
312	{
313	VM_BUG_ON(!page_cgroup_locked(page));
314	page->page_cgroup = ((unsigned long)pc \| PAGE_CGROUP_LOCK);
315	}
316
317	struct page_cgroup page_get_page_cgroup(struct page page)
318	{
319	return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
320	}
321
322	static void lock_page_cgroup(struct page *page)
323	{
324	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
325	}
326
327	static int try_lock_page_cgroup(struct page *page)
328	{
329	return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
330	}
331
332	static void unlock_page_cgroup(struct page *page)
333	{
334	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
335	}
336
337	static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,	239	static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
338	struct page_cgroup *pc)	240	struct page_cgroup *pc)
339	{	241	{
@@ -436,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
436	* safely get to page_cgroup without it, so just try_lock it:	338	* safely get to page_cgroup without it, so just try_lock it:
437	* mem_cgroup_isolate_pages allows for page left on wrong list.	339	* mem_cgroup_isolate_pages allows for page left on wrong list.
438	*/	340	*/
439	if (!try_lock_page_cgroup(page))	341	pc = lookup_page_cgroup(page);
		342	if (!trylock_page_cgroup(pc))
440	return;	343	return;
441		344	if (pc && PageCgroupUsed(pc)) {
442	pc = page_get_page_cgroup(page);
443	if (pc) {
444	mz = page_cgroup_zoneinfo(pc);	345	mz = page_cgroup_zoneinfo(pc);
445	spin_lock_irqsave(&mz->lru_lock, flags);	346	spin_lock_irqsave(&mz->lru_lock, flags);
446	__mem_cgroup_move_lists(pc, lru);	347	__mem_cgroup_move_lists(pc, lru);
447	spin_unlock_irqrestore(&mz->lru_lock, flags);	348	spin_unlock_irqrestore(&mz->lru_lock, flags);
448	}	349	}
449	unlock_page_cgroup(page);	350	unlock_page_cgroup(pc);
450	}	351	}
451		352
452	/*	353	/*
@@ -533,6 +434,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
533	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {	434	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
534	if (scan >= nr_to_scan)	435	if (scan >= nr_to_scan)
535	break;	436	break;
		437	if (unlikely(!PageCgroupUsed(pc)))
		438	continue;
536	page = pc->page;	439	page = pc->page;
537		440
538	if (unlikely(!PageLRU(page)))	441	if (unlikely(!PageLRU(page)))
@@ -576,26 +479,27 @@ static int mem_cgroup_charge_common(struct page page, struct mm_struct mm,
576	{	479	{
577	struct mem_cgroup *mem;	480	struct mem_cgroup *mem;
578	struct page_cgroup *pc;	481	struct page_cgroup *pc;
579	unsigned long flags;
580	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;	482	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
581	struct mem_cgroup_per_zone *mz;	483	struct mem_cgroup_per_zone *mz;
		484	unsigned long flags;
582		485
583	pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);	486	pc = lookup_page_cgroup(page);
584	if (unlikely(pc == NULL))	487	/* can happen at boot */
585	goto err;	488	if (unlikely(!pc))
586		489	return 0;
		490	prefetchw(pc);
587	/*	491	/*
588	* We always charge the cgroup the mm_struct belongs to.	492	* We always charge the cgroup the mm_struct belongs to.
589	* The mm_struct's mem_cgroup changes on task migration if the	493	* The mm_struct's mem_cgroup changes on task migration if the
590	* thread group leader migrates. It's possible that mm is not	494	* thread group leader migrates. It's possible that mm is not
591	* set, if so charge the init_mm (happens for pagecache usage).	495	* set, if so charge the init_mm (happens for pagecache usage).
592	*/	496	*/
		497
593	if (likely(!memcg)) {	498	if (likely(!memcg)) {
594	rcu_read_lock();	499	rcu_read_lock();
595	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));	500	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
596	if (unlikely(!mem)) {	501	if (unlikely(!mem)) {
597	rcu_read_unlock();	502	rcu_read_unlock();
598	kmem_cache_free(page_cgroup_cache, pc);
599	return 0;	503	return 0;
600	}	504	}
601	/*	505	/*
@@ -631,36 +535,33 @@ static int mem_cgroup_charge_common(struct page page, struct mm_struct mm,
631	}	535	}
632	}	536	}
633		537
		538
		539	lock_page_cgroup(pc);
		540	if (unlikely(PageCgroupUsed(pc))) {
		541	unlock_page_cgroup(pc);
		542	res_counter_uncharge(&mem->res, PAGE_SIZE);
		543	css_put(&mem->css);
		544
		545	goto done;
		546	}
634	pc->mem_cgroup = mem;	547	pc->mem_cgroup = mem;
635	pc->page = page;
636	/*	548	/*
637	* If a page is accounted as a page cache, insert to inactive list.	549	* If a page is accounted as a page cache, insert to inactive list.
638	* If anon, insert to active list.	550	* If anon, insert to active list.
639	*/	551	*/
640	pc->flags = pcg_default_flags[ctype];	552	pc->flags = pcg_default_flags[ctype];
641		553
642	lock_page_cgroup(page);
643	if (unlikely(page_get_page_cgroup(page))) {
644	unlock_page_cgroup(page);
645	res_counter_uncharge(&mem->res, PAGE_SIZE);
646	css_put(&mem->css);
647	kmem_cache_free(page_cgroup_cache, pc);
648	goto done;
649	}
650	page_assign_page_cgroup(page, pc);
651
652	mz = page_cgroup_zoneinfo(pc);	554	mz = page_cgroup_zoneinfo(pc);
		555
653	spin_lock_irqsave(&mz->lru_lock, flags);	556	spin_lock_irqsave(&mz->lru_lock, flags);
654	__mem_cgroup_add_list(mz, pc);	557	__mem_cgroup_add_list(mz, pc);
655	spin_unlock_irqrestore(&mz->lru_lock, flags);	558	spin_unlock_irqrestore(&mz->lru_lock, flags);
		559	unlock_page_cgroup(pc);
656		560
657	unlock_page_cgroup(page);
658	done:	561	done:
659	return 0;	562	return 0;
660	out:	563	out:
661	css_put(&mem->css);	564	css_put(&mem->css);
662	kmem_cache_free(page_cgroup_cache, pc);
663	err:
664	return -ENOMEM;	565	return -ENOMEM;
665	}	566	}
666		567
@@ -668,7 +569,8 @@ int mem_cgroup_charge(struct page page, struct mm_struct mm, gfp_t gfp_mask)
668	{	569	{
669	if (mem_cgroup_subsys.disabled)	570	if (mem_cgroup_subsys.disabled)
670	return 0;	571	return 0;
671		572	if (PageCompound(page))
		573	return 0;
672	/*	574	/*
673	* If already mapped, we don't have to account.	575	* If already mapped, we don't have to account.
674	* If page cache, page->mapping has address_space.	576	* If page cache, page->mapping has address_space.
@@ -689,7 +591,8 @@ int mem_cgroup_cache_charge(struct page page, struct mm_struct mm,
689	{	591	{
690	if (mem_cgroup_subsys.disabled)	592	if (mem_cgroup_subsys.disabled)
691	return 0;	593	return 0;
692		594	if (PageCompound(page))
		595	return 0;
693	/*	596	/*
694	* Corner case handling. This is called from add_to_page_cache()	597	* Corner case handling. This is called from add_to_page_cache()
695	* in usual. But some FS (shmem) precharges this page before calling it	598	* in usual. But some FS (shmem) precharges this page before calling it
@@ -702,15 +605,16 @@ int mem_cgroup_cache_charge(struct page page, struct mm_struct mm,
702	if (!(gfp_mask & __GFP_WAIT)) {	605	if (!(gfp_mask & __GFP_WAIT)) {
703	struct page_cgroup *pc;	606	struct page_cgroup *pc;
704		607
705	lock_page_cgroup(page);	608
706	pc = page_get_page_cgroup(page);	609	pc = lookup_page_cgroup(page);
707	if (pc) {	610	if (!pc)
708	VM_BUG_ON(pc->page != page);	611	return 0;
709	VM_BUG_ON(!pc->mem_cgroup);	612	lock_page_cgroup(pc);
710	unlock_page_cgroup(page);	613	if (PageCgroupUsed(pc)) {
		614	unlock_page_cgroup(pc);
711	return 0;	615	return 0;
712	}	616	}
713	unlock_page_cgroup(page);	617	unlock_page_cgroup(pc);
714	}	618	}
715		619
716	if (unlikely(!mm))	620	if (unlikely(!mm))
@@ -741,37 +645,39 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
741	/*	645	/*
742	* Check if our page_cgroup is valid	646	* Check if our page_cgroup is valid
743	*/	647	*/
744	lock_page_cgroup(page);	648	pc = lookup_page_cgroup(page);
745	pc = page_get_page_cgroup(page);	649	if (unlikely(!pc \|\| !PageCgroupUsed(pc)))
746	if (unlikely(!pc))	650	return;
747	goto unlock;
748
749	VM_BUG_ON(pc->page != page);
750		651
751	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)	652	lock_page_cgroup(pc);
752	&& ((PageCgroupCache(pc) \|\| page_mapped(page))))	653	if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
753	goto unlock;	654	\|\| !PageCgroupUsed(pc)) {
		655	/* This happens at race in zap_pte_range() and do_swap_page()*/
		656	unlock_page_cgroup(pc);
		657	return;
		658	}
		659	ClearPageCgroupUsed(pc);
		660	mem = pc->mem_cgroup;
754		661
755	mz = page_cgroup_zoneinfo(pc);	662	mz = page_cgroup_zoneinfo(pc);
756	spin_lock_irqsave(&mz->lru_lock, flags);	663	spin_lock_irqsave(&mz->lru_lock, flags);
757	__mem_cgroup_remove_list(mz, pc);	664	__mem_cgroup_remove_list(mz, pc);
758	spin_unlock_irqrestore(&mz->lru_lock, flags);	665	spin_unlock_irqrestore(&mz->lru_lock, flags);
		666	unlock_page_cgroup(pc);
759		667
760	page_assign_page_cgroup(page, NULL);
761	unlock_page_cgroup(page);
762
763	mem = pc->mem_cgroup;
764	res_counter_uncharge(&mem->res, PAGE_SIZE);	668	res_counter_uncharge(&mem->res, PAGE_SIZE);
765	css_put(&mem->css);	669	css_put(&mem->css);
766		670
767	kmem_cache_free(page_cgroup_cache, pc);
768	return;	671	return;
769	unlock:
770	unlock_page_cgroup(page);
771	}	672	}
772		673
773	void mem_cgroup_uncharge_page(struct page *page)	674	void mem_cgroup_uncharge_page(struct page *page)
774	{	675	{
		676	/* early check. */
		677	if (page_mapped(page))
		678	return;
		679	if (page->mapping && !PageAnon(page))
		680	return;
775	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);	681	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
776	}	682	}
777		683
@@ -795,9 +701,9 @@ int mem_cgroup_prepare_migration(struct page page, struct page newpage)
795	if (mem_cgroup_subsys.disabled)	701	if (mem_cgroup_subsys.disabled)
796	return 0;	702	return 0;
797		703
798	lock_page_cgroup(page);	704	pc = lookup_page_cgroup(page);
799	pc = page_get_page_cgroup(page);	705	lock_page_cgroup(pc);
800	if (pc) {	706	if (PageCgroupUsed(pc)) {
801	mem = pc->mem_cgroup;	707	mem = pc->mem_cgroup;
802	css_get(&mem->css);	708	css_get(&mem->css);
803	if (PageCgroupCache(pc)) {	709	if (PageCgroupCache(pc)) {
@@ -807,7 +713,7 @@ int mem_cgroup_prepare_migration(struct page page, struct page newpage)
807	ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;	713	ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
808	}	714	}
809	}	715	}
810	unlock_page_cgroup(page);	716	unlock_page_cgroup(pc);
811	if (mem) {	717	if (mem) {
812	ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,	718	ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
813	ctype, mem);	719	ctype, mem);
@@ -832,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
832	*/	738	*/
833	if (!newpage->mapping)	739	if (!newpage->mapping)
834	__mem_cgroup_uncharge_common(newpage,	740	__mem_cgroup_uncharge_common(newpage,
835	MEM_CGROUP_CHARGE_TYPE_FORCE);	741	MEM_CGROUP_CHARGE_TYPE_FORCE);
836	else if (PageAnon(newpage))	742	else if (PageAnon(newpage))
837	mem_cgroup_uncharge_page(newpage);	743	mem_cgroup_uncharge_page(newpage);
838	}	744	}
@@ -918,6 +824,8 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
918	while (!list_empty(list)) {	824	while (!list_empty(list)) {
919	pc = list_entry(list->prev, struct page_cgroup, lru);	825	pc = list_entry(list->prev, struct page_cgroup, lru);
920	page = pc->page;	826	page = pc->page;
		827	if (!PageCgroupUsed(pc))
		828	break;
921	get_page(page);	829	get_page(page);
922	spin_unlock_irqrestore(&mz->lru_lock, flags);	830	spin_unlock_irqrestore(&mz->lru_lock, flags);
923	/*	831	/*
@@ -932,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
932	count = FORCE_UNCHARGE_BATCH;	840	count = FORCE_UNCHARGE_BATCH;
933	cond_resched();	841	cond_resched();
934	}	842	}
935	} else	843	} else {
936	cond_resched();	844	spin_lock_irqsave(&mz->lru_lock, flags);
		845	break;
		846	}
937	spin_lock_irqsave(&mz->lru_lock, flags);	847	spin_lock_irqsave(&mz->lru_lock, flags);
938	}	848	}
939	spin_unlock_irqrestore(&mz->lru_lock, flags);	849	spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -957,6 +867,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
957	while (mem->res.usage > 0) {	867	while (mem->res.usage > 0) {
958	if (atomic_read(&mem->css.cgroup->count) > 0)	868	if (atomic_read(&mem->css.cgroup->count) > 0)
959	goto out;	869	goto out;
		870	/* This is for making all used pages to be on LRU. */
		871	lru_add_drain_all();
960	for_each_node_state(node, N_POSSIBLE)	872	for_each_node_state(node, N_POSSIBLE)
961	for (zid = 0; zid < MAX_NR_ZONES; zid++) {	873	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
962	struct mem_cgroup_per_zone *mz;	874	struct mem_cgroup_per_zone *mz;
@@ -965,6 +877,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
965	for_each_lru(l)	877	for_each_lru(l)
966	mem_cgroup_force_empty_list(mem, mz, l);	878	mem_cgroup_force_empty_list(mem, mz, l);
967	}	879	}
		880	cond_resched();
968	}	881	}
969	ret = 0;	882	ret = 0;
970	out:	883	out:
@@ -1175,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys ss, struct cgroup cont)
1175	int node;	1088	int node;
1176		1089
1177	if (unlikely((cont->parent) == NULL)) {	1090	if (unlikely((cont->parent) == NULL)) {
		1091	page_cgroup_init();
1178	mem = &init_mem_cgroup;	1092	mem = &init_mem_cgroup;
1179	page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
1180	} else {	1093	} else {
1181	mem = mem_cgroup_alloc();	1094	mem = mem_cgroup_alloc();
1182	if (!mem)	1095	if (!mem)