1 files changed, 163 insertions, 3 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c7..ebca767292dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 struct cgroup_subsys mem_cgroup_subsys;
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. Maybe even add a low water mark, such that
+ * no reclaim occurs from a cgroup at its low water mark; this is
+ * a feature that will be implemented much later in the future.
 */
 struct mem_cgroup {
        struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is at least two-byte
+ * aligned (based on comments from Nick Piggin)
+ */
+/* Index of the bit in page->page_cgroup handed to the bit_spin_*() calls */
+#define PAGE_CGROUP_LOCK_BIT    0x0
+/* Mask for that bit, used to strip or carry over the lock state of the word */
+#define PAGE_CGROUP_LOCK                (1 << PAGE_CGROUP_LOCK_BIT)
+/*
 * A page_cgroup page is associated with every page descriptor. The
 * page_cgroup helps us identify information about the cgroup
 */
@@ -56,6 +69,8 @@ struct page_cgroup {
        struct list_head lru;           /* per cgroup LRU list */
        struct page *page;
        struct mem_cgroup *mem_cgroup;
+        atomic_t ref_cnt;               /* Helpful when pages move b/w  */
+                                        /* mapped and cached states     */
 };
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
        css_put(&mm->mem_cgroup->css);
 }
+/*
+ * Tell whether the bit spin lock kept in page->page_cgroup is held.
+ * Returns whatever bit_spin_is_locked() reports for PAGE_CGROUP_LOCK_BIT.
+ */
+static inline int page_cgroup_locked(struct page *page)
+{
+        unsigned long *lock_word = &page->page_cgroup;
+        return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, lock_word);
+}
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-        page->page_cgroup = (unsigned long)pc;
+        int locked;
+        /*
+         * While resetting the page_cgroup we might not hold the
+         * page_cgroup lock. free_hot_cold_page() is an example
+         * of such a scenario
+         */
+        if (pc)
+                VM_BUG_ON(!page_cgroup_locked(page));
+        locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+        page->page_cgroup = ((unsigned long)pc | locked);
 }
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-        return page->page_cgroup;
+        return (struct page_cgroup *)
+                (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+/*
+ * Acquire the bit spin lock stored in bit PAGE_CGROUP_LOCK_BIT of
+ * page->page_cgroup.
+ */
+void __always_inline lock_page_cgroup(struct page *page)
+{
+        bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+        /* Sanity check: the lock bit must read as held at this point */
+        VM_BUG_ON(!page_cgroup_locked(page));
+}
+/*
+ * Release the bit spin lock stored in bit PAGE_CGROUP_LOCK_BIT of
+ * page->page_cgroup. Counterpart of lock_page_cgroup().
+ */
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+        bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+/*
+ * Charge the memory controller for page usage.
+ *
+ * @page: page to account
+ * @mm: mm_struct whose mem_cgroup is charged; init_mm is used when NULL
+ *
+ * A page that already carries a page_cgroup only gets that page_cgroup's
+ * reference count raised; no new charge is made.
+ *
+ * Return
+ * 0 if the charge was successful
+ * -ENOMEM if the page_cgroup could not be allocated, or if
+ * res_counter_charge() refused the charge (the cgroup is over its limit)
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+        struct mem_cgroup *mem;
+        struct page_cgroup *pc, *race_pc;
+        /*
+         * Should page_cgroup's go to their own slab?
+         * One could optimize the performance of the charging routine
+         * by saving a bit in the page_flags and using it as a lock
+         * to see if the cgroup page already has a page_cgroup associated
+         * with it
+         */
+        lock_page_cgroup(page);
+        pc = page_get_page_cgroup(page);
+        /*
+         * The page_cgroup exists and the page has already been accounted
+         */
+        if (pc) {
+                atomic_inc(&pc->ref_cnt);
+                goto done;
+        }
+        unlock_page_cgroup(page);
+        /*
+         * The page_cgroup lock is not held across the allocation, so a
+         * failure here must return directly instead of unlocking again.
+         */
+        pc = kzalloc(sizeof(*pc), GFP_KERNEL);
+        if (pc == NULL)
+                return -ENOMEM;
+        rcu_read_lock();
+        /*
+         * We always charge the cgroup the mm_struct belongs to
+         * the mm_struct's mem_cgroup changes on task migration if the
+         * thread group leader migrates. It's possible that mm is not
+         * set, if so charge the init_mm (happens for pagecache usage).
+         */
+        if (!mm)
+                mm = &init_mm;
+        mem = rcu_dereference(mm->mem_cgroup);
+        /*
+         * For every charge from the cgroup, increment reference
+         * count
+         */
+        css_get(&mem->css);
+        rcu_read_unlock();
+        /*
+         * If we created the page_cgroup, we should free it on exceeding
+         * the cgroup limit.
+         */
+        if (res_counter_charge(&mem->res, 1)) {
+                css_put(&mem->css);
+                goto free_pc;
+        }
+        lock_page_cgroup(page);
+        /*
+         * Check if somebody else beat us to allocating the page_cgroup.
+         * If so, take a reference on theirs and undo everything we set up
+         * for ours: the allocation, the charge and the css reference.
+         */
+        race_pc = page_get_page_cgroup(page);
+        if (race_pc) {
+                kfree(pc);
+                pc = race_pc;
+                atomic_inc(&pc->ref_cnt);
+                res_counter_uncharge(&mem->res, 1);
+                css_put(&mem->css);
+                goto done;
+        }
+        atomic_set(&pc->ref_cnt, 1);
+        pc->mem_cgroup = mem;
+        pc->page = page;
+        page_assign_page_cgroup(page, pc);
+done:
+        /* Every path reaching this label holds the page_cgroup lock */
+        unlock_page_cgroup(page);
+        return 0;
+free_pc:
+        /* Reached with the page_cgroup lock already released */
+        kfree(pc);
+        return -ENOMEM;
+}
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge.
+ *
+ * @pc: page_cgroup whose reference is dropped; NULL is ignored
+ *
+ * When the last reference goes away the page_cgroup is detached from its
+ * page, one unit is returned to the cgroup's res_counter, the css
+ * reference taken at charge time is released and the page_cgroup is freed.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+        struct mem_cgroup *mem;
+        struct page *page;
+        if (!pc)
+                return;
+        page = pc->page;
+        /*
+         * Drop the reference with the page_cgroup lock held.
+         * mem_cgroup_charge() raises ref_cnt under this same lock, so it
+         * can no longer take a reference on a page_cgroup whose count has
+         * already reached zero and which is about to be freed.
+         */
+        lock_page_cgroup(page);
+        if (!atomic_dec_and_test(&pc->ref_cnt)) {
+                unlock_page_cgroup(page);
+                return;
+        }
+        mem = pc->mem_cgroup;
+        page_assign_page_cgroup(page, NULL);
+        unlock_page_cgroup(page);
+        res_counter_uncharge(&mem->res, 1);
+        /* Release the css reference only after the last use of mem */
+        css_put(&mem->css);
+        kfree(pc);
 }
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                return NULL;
        res_counter_init(&mem->res);
+        INIT_LIST_HEAD(&mem->active_list);
+        INIT_LIST_HEAD(&mem->inactive_list);
        return &mem->css;
 }

diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d4805eb37c7..ebca767292dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
21	#include <linux/memcontrol.h>	21	#include <linux/memcontrol.h>
22	#include <linux/cgroup.h>	22	#include <linux/cgroup.h>
23	#include <linux/mm.h>	23	#include <linux/mm.h>
		24	#include <linux/page-flags.h>
		25	#include <linux/bit_spinlock.h>
		26	#include <linux/rcupdate.h>
24		27
25	struct cgroup_subsys mem_cgroup_subsys;	28	struct cgroup_subsys mem_cgroup_subsys;
26		29
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
31	* to help the administrator determine what knobs to tune.	34	* to help the administrator determine what knobs to tune.
32	*	35	*
33	* TODO: Add a water mark for the memory controller. Reclaim will begin when	36	* TODO: Add a water mark for the memory controller. Reclaim will begin when
34	* we hit the water mark.	37	* we hit the water mark. May be even add a low water mark, such that
		38	* no reclaim occurs from a cgroup at it's low water mark, this is
		39	* a feature that will be implemented much later in the future.
35	*/	40	*/
36	struct mem_cgroup {	41	struct mem_cgroup {
37	struct cgroup_subsys_state css;	42	struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
49	};	54	};
50		55
51	/*	56	/*
		57	* We use the lower bit of the page->page_cgroup pointer as a bit spin
		58	* lock. We need to ensure that page->page_cgroup is atleast two
		59	* byte aligned (based on comments from Nick Piggin)
		60	*/
		61	#define PAGE_CGROUP_LOCK_BIT 0x0
		62	#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
		63
		64	/*
52	* A page_cgroup page is associated with every page descriptor. The	65	* A page_cgroup page is associated with every page descriptor. The
53	* page_cgroup helps us identify information about the cgroup	66	* page_cgroup helps us identify information about the cgroup
54	*/	67	*/
@@ -56,6 +69,8 @@ struct page_cgroup {
56	struct list_head lru; /* per cgroup LRU list */	69	struct list_head lru; /* per cgroup LRU list */
57	struct page *page;	70	struct page *page;
58	struct mem_cgroup *mem_cgroup;	71	struct mem_cgroup *mem_cgroup;
		72	atomic_t ref_cnt; /* Helpful when pages move b/w */
		73	/* mapped and cached states */
59	};	74	};
60		75
61		76
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
88	css_put(&mm->mem_cgroup->css);	103	css_put(&mm->mem_cgroup->css);
89	}	104	}
90		105
		106	static inline int page_cgroup_locked(struct page *page)
		107	{
		108	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
		109	&page->page_cgroup);
		110	}
		111
91	void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)	112	void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
92	{	113	{
93	page->page_cgroup = (unsigned long)pc;	114	int locked;
		115
		116	/*
		117	* While resetting the page_cgroup we might not hold the
		118	* page_cgroup lock. free_hot_cold_page() is an example
		119	* of such a scenario
		120	*/
		121	if (pc)
		122	VM_BUG_ON(!page_cgroup_locked(page));
		123	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
		124	page->page_cgroup = ((unsigned long)pc | locked);
94	}	125	}
95		126
96	struct page_cgroup *page_get_page_cgroup(struct page *page)	127	struct page_cgroup *page_get_page_cgroup(struct page *page)
97	{	128	{
98	return page->page_cgroup;	129	return (struct page_cgroup *)
		130	(page->page_cgroup & ~PAGE_CGROUP_LOCK);
		131	}
		132
		133	void __always_inline lock_page_cgroup(struct page *page)
		134	{
		135	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
		136	VM_BUG_ON(!page_cgroup_locked(page));
		137	}
		138
		139	void __always_inline unlock_page_cgroup(struct page *page)
		140	{
		141	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
		142	}
		143
		144	/*
		145	* Charge the memory controller for page usage.
		146	* Return
		147	* 0 if the charge was successful
		148	* < 0 if the cgroup is over its limit
		149	*/
		150	int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
		151	{
		152	struct mem_cgroup *mem;
		153	struct page_cgroup *pc, *race_pc;
		154
		155	/*
		156	* Should page_cgroup's go to their own slab?
		157	* One could optimize the performance of the charging routine
		158	* by saving a bit in the page_flags and using it as a lock
		159	* to see if the cgroup page already has a page_cgroup associated
		160	* with it
		161	*/
		162	lock_page_cgroup(page);
		163	pc = page_get_page_cgroup(page);
		164	/*
		165	* The page_cgroup exists and the page has already been accounted
		166	*/
		167	if (pc) {
		168	atomic_inc(&pc->ref_cnt);
		169	goto done;
		170	}
		171
		172	unlock_page_cgroup(page);
		173
		174	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
		175	if (pc == NULL)
		176	goto err;
		177
		178	rcu_read_lock();
		179	/*
		180	* We always charge the cgroup the mm_struct belongs to
		181	* the mm_struct's mem_cgroup changes on task migration if the
		182	* thread group leader migrates. It's possible that mm is not
		183	* set, if so charge the init_mm (happens for pagecache usage).
		184	*/
		185	if (!mm)
		186	mm = &init_mm;
		187
		188	mem = rcu_dereference(mm->mem_cgroup);
		189	/*
		190	* For every charge from the cgroup, increment reference
		191	* count
		192	*/
		193	css_get(&mem->css);
		194	rcu_read_unlock();
		195
		196	/*
		197	* If we created the page_cgroup, we should free it on exceeding
		198	* the cgroup limit.
		199	*/
		200	if (res_counter_charge(&mem->res, 1)) {
		201	css_put(&mem->css);
		202	goto free_pc;
		203	}
		204
		205	lock_page_cgroup(page);
		206	/*
		207	* Check if somebody else beat us to allocating the page_cgroup
		208	*/
		209	race_pc = page_get_page_cgroup(page);
		210	if (race_pc) {
		211	kfree(pc);
		212	pc = race_pc;
		213	atomic_inc(&pc->ref_cnt);
		214	res_counter_uncharge(&mem->res, 1);
		215	css_put(&mem->css);
		216	goto done;
		217	}
		218
		219	atomic_set(&pc->ref_cnt, 1);
		220	pc->mem_cgroup = mem;
		221	pc->page = page;
		222	page_assign_page_cgroup(page, pc);
		223
		224	done:
		225	unlock_page_cgroup(page);
		226	return 0;
		227	free_pc:
		228	kfree(pc);
		229	return -ENOMEM;
		230	err:
		231	unlock_page_cgroup(page);
		232	return -ENOMEM;
		233	}
		234
		235	/*
		236	* Uncharging is always a welcome operation, we never complain, simply
		237	* uncharge.
		238	*/
		239	void mem_cgroup_uncharge(struct page_cgroup *pc)
		240	{
		241	struct mem_cgroup *mem;
		242	struct page *page;
		243
		244	if (!pc)
		245	return;
		246
		247	if (atomic_dec_and_test(&pc->ref_cnt)) {
		248	page = pc->page;
		249	lock_page_cgroup(page);
		250	mem = pc->mem_cgroup;
		251	css_put(&mem->css);
		252	page_assign_page_cgroup(page, NULL);
		253	unlock_page_cgroup(page);
		254	res_counter_uncharge(&mem->res, 1);
		255	kfree(pc);
		256	}
99	}	257	}
100		258
101	static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,	259	static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
150	return NULL;	308	return NULL;
151		309
152	res_counter_init(&mem->res);	310	res_counter_init(&mem->res);
		311	INIT_LIST_HEAD(&mem->active_list);
		312	INIT_LIST_HEAD(&mem->inactive_list);
153	return &mem->css;	313	return &mem->css;
154	}	314	}
155		315