Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  166
1 file changed, 163 insertions, 3 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c7..ebca767292dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. Maybe even add a low water mark, such that
+ * no reclaim occurs from a cgroup at its low water mark; this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is at least two-byte
+ * aligned (based on comments from Nick Piggin).
+ */
+#define PAGE_CGROUP_LOCK_BIT	0x0
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -56,6 +69,8 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
 	css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+					&page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-	page->page_cgroup = (unsigned long)pc;
+	int locked;
+
+	/*
+	 * While resetting the page_cgroup we might not hold the
+	 * page_cgroup lock. free_hot_cold_page() is an example
+	 * of such a scenario.
+	 */
+	if (pc)
+		VM_BUG_ON(!page_cgroup_locked(page));
+	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+	page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-	return page->page_cgroup;
+	return (struct page_cgroup *)
+		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	struct mem_cgroup *mem;
+	struct page_cgroup *pc, *race_pc;
+
+	/*
+	 * Should page_cgroups go to their own slab?
+	 * One could optimize the performance of the charging routine
+	 * by saving a bit in the page_flags and using it as a lock
+	 * to see if the cgroup page already has a page_cgroup associated
+	 * with it.
+	 */
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	/*
+	 * The page_cgroup exists and the page has already been accounted.
+	 */
+	if (pc) {
+		atomic_inc(&pc->ref_cnt);
+		goto done;
+	}
+
+	unlock_page_cgroup(page);
+
+	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+	if (pc == NULL)
+		goto err;
+
+	rcu_read_lock();
+	/*
+	 * We always charge the cgroup the mm_struct belongs to;
+	 * the mm_struct's mem_cgroup changes on task migration if the
+	 * thread group leader migrates. It's possible that mm is not
+	 * set; if so, charge the init_mm (happens for pagecache usage).
+	 */
+	if (!mm)
+		mm = &init_mm;
+
+	mem = rcu_dereference(mm->mem_cgroup);
+	/*
+	 * For every charge from the cgroup, increment the reference
+	 * count.
+	 */
+	css_get(&mem->css);
+	rcu_read_unlock();
+
+	/*
+	 * If we created the page_cgroup, we should free it on exceeding
+	 * the cgroup limit.
+	 */
+	if (res_counter_charge(&mem->res, 1)) {
+		css_put(&mem->css);
+		goto free_pc;
+	}
+
+	lock_page_cgroup(page);
+	/*
+	 * Check if somebody else beat us to allocating the page_cgroup.
+	 */
+	race_pc = page_get_page_cgroup(page);
+	if (race_pc) {
+		kfree(pc);
+		pc = race_pc;
+		atomic_inc(&pc->ref_cnt);
+		res_counter_uncharge(&mem->res, 1);
+		css_put(&mem->css);
+		goto done;
+	}
+
+	atomic_set(&pc->ref_cnt, 1);
+	pc->mem_cgroup = mem;
+	pc->page = page;
+	page_assign_page_cgroup(page, pc);
+
+done:
+	unlock_page_cgroup(page);
+	return 0;
+free_pc:
+	kfree(pc);
+	return -ENOMEM;
+err:
+	unlock_page_cgroup(page);
+	return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation; we never complain, we
+ * simply uncharge.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem;
+	struct page *page;
+
+	if (!pc)
+		return;
+
+	if (atomic_dec_and_test(&pc->ref_cnt)) {
+		page = pc->page;
+		lock_page_cgroup(page);
+		mem = pc->mem_cgroup;
+		css_put(&mem->css);
+		page_assign_page_cgroup(page, NULL);
+		unlock_page_cgroup(page);
+		res_counter_uncharge(&mem->res, 1);
+		kfree(pc);
+	}
 }
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return NULL;
 
 	res_counter_init(&mem->res);
+	INIT_LIST_HEAD(&mem->active_list);
+	INIT_LIST_HEAD(&mem->inactive_list);
 	return &mem->css;
 }
 
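
A note on the locking scheme added above: because struct page_cgroup is allocated with kzalloc() and is therefore at least word aligned, bit 0 of page->page_cgroup is always free, so the patch keeps both the bit spin lock and the pointer in that single unsigned long. The standalone sketch below (plain C11, userspace) only illustrates that mask-and-or encoding; the names pcg_word, pcg_lock, pcg_unlock, pcg_assign and pcg_get are made up for the example, and a compare-and-swap loop stands in for the kernel's bit_spin_lock()/bit_spin_unlock(). It is a minimal sketch of the technique, not the kernel implementation.

/*
 * Illustrative userspace sketch of the "lock bit in the low bit of a
 * pointer" encoding used by page->page_cgroup above.  All names here
 * are hypothetical; the kernel operates on the real page field with
 * bit_spin_lock()/bit_spin_unlock().
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PCG_LOCK_BIT	0x0
#define PCG_LOCK	(1UL << PCG_LOCK_BIT)

struct fake_page_cgroup {
	int ref_cnt;			/* stands in for atomic_t ref_cnt */
};

/* Stands in for page->page_cgroup: pointer bits plus the lock bit. */
static _Atomic uintptr_t pcg_word;

static void pcg_lock(void)
{
	uintptr_t old;

	/* Spin until the lock bit is observed clear and we manage to set it. */
	for (;;) {
		old = atomic_load(&pcg_word) & ~PCG_LOCK;
		if (atomic_compare_exchange_weak(&pcg_word, &old, old | PCG_LOCK))
			return;
	}
}

static void pcg_unlock(void)
{
	atomic_fetch_and(&pcg_word, ~PCG_LOCK);
}

/*
 * Mirrors page_assign_page_cgroup(): preserve the current lock bit and
 * swap in the new pointer.  When installing a non-NULL pointer this must
 * be called with the lock held, just as the VM_BUG_ON above checks.
 */
static void pcg_assign(struct fake_page_cgroup *pc)
{
	uintptr_t locked = atomic_load(&pcg_word) & PCG_LOCK;

	atomic_store(&pcg_word, (uintptr_t)pc | locked);
}

/* Mirrors page_get_page_cgroup(): mask the lock bit off before using. */
static struct fake_page_cgroup *pcg_get(void)
{
	return (struct fake_page_cgroup *)(atomic_load(&pcg_word) & ~PCG_LOCK);
}

int main(void)
{
	static struct fake_page_cgroup pc;

	/* The encoding only works if the pointer's low bit is always zero. */
	assert(((uintptr_t)&pc & PCG_LOCK) == 0);

	pcg_lock();
	pcg_assign(&pc);
	pcg_get()->ref_cnt = 1;
	pcg_unlock();

	printf("page_cgroup at %p, ref_cnt %d\n",
	       (void *)pcg_get(), pcg_get()->ref_cnt);
	return 0;
}

Folding the lock into the existing page->page_cgroup word is what lets the patch avoid adding a separate lock field to struct page; the cost is that every reader has to mask PAGE_CGROUP_LOCK off, which is exactly what page_get_page_cgroup() does in the hunk above.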