Diffstat (limited to 'mm/memcontrol.c')
 mm/memcontrol.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 163 insertions(+), 3 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c7..ebca767292dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. May be even add a low water mark, such that
+ * no reclaim occurs from a cgroup at it's low water mark, this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is atleast two
+ * byte aligned (based on comments from Nick Piggin)
+ */
+#define PAGE_CGROUP_LOCK_BIT	0x0
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
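The two defines above pack a bit spinlock into bit 0 of page->page_cgroup; this is safe only because the pointer is at least two-byte aligned, so bit 0 of any valid pointer value is always clear. Below is a minimal userspace sketch of the encoding (hypothetical names, plain loads and stores instead of the atomic bit_spin_lock()/bit_spin_unlock() the kernel uses):

/* Simplified model of the pointer-with-lock-bit encoding above.
 * Names are hypothetical and there is no real locking here. */
#include <assert.h>
#include <stdio.h>

#define LOCK_BIT	0x0
#define LOCK_MASK	(1UL << LOCK_BIT)

struct payload { int value; };

/* Recover the real pointer by masking off the lock bit. */
static struct payload *word_to_ptr(unsigned long word)
{
	return (struct payload *)(word & ~LOCK_MASK);
}

/* Install a new pointer while preserving the current lock bit. */
static void word_assign(unsigned long *word, struct payload *p)
{
	unsigned long locked = *word & LOCK_MASK;

	*word = (unsigned long)p | locked;
}

int main(void)
{
	static struct payload p = { .value = 42 };
	unsigned long word = 0;

	/* Allocators return at-least-word-aligned memory, so bit 0 of
	 * a real pointer is guaranteed clear and free to reuse. */
	assert(((unsigned long)&p & LOCK_MASK) == 0);

	word |= LOCK_MASK;		/* "lock" taken */
	word_assign(&word, &p);		/* keeps bit 0 set */
	printf("locked=%lu value=%d\n",
	       word & LOCK_MASK, word_to_ptr(word)->value);
	word &= ~LOCK_MASK;		/* "lock" released */
	return 0;
}

Every reader of such a field must mask the bit off and every writer must preserve it, which is exactly what page_get_page_cgroup() and page_assign_page_cgroup() do further down in this diff.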
@@ -56,6 +69,8 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	atomic_t ref_cnt;		/* Helpful when pages move b/w */
+					/* mapped and cached states    */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
 	css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+					&page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-	page->page_cgroup = (unsigned long)pc;
+	int locked;
+
+	/*
+	 * While resetting the page_cgroup we might not hold the
+	 * page_cgroup lock. free_hot_cold_page() is an example
+	 * of such a scenario
+	 */
+	if (pc)
+		VM_BUG_ON(!page_cgroup_locked(page));
+	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+	page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-	return page->page_cgroup;
+	return (struct page_cgroup *)
+		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	struct mem_cgroup *mem;
+	struct page_cgroup *pc, *race_pc;
+
+	/*
+	 * Should page_cgroup's go to their own slab?
+	 * One could optimize the performance of the charging routine
+	 * by saving a bit in the page_flags and using it as a lock
+	 * to see if the cgroup page already has a page_cgroup associated
+	 * with it
+	 */
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	/*
+	 * The page_cgroup exists and the page has already been accounted
+	 */
+	if (pc) {
+		atomic_inc(&pc->ref_cnt);
+		goto done;
+	}
+
+	unlock_page_cgroup(page);
+
+	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+	if (pc == NULL)
+		goto err;
+
+	rcu_read_lock();
+	/*
+	 * We always charge the cgroup the mm_struct belongs to
+	 * the mm_struct's mem_cgroup changes on task migration if the
+	 * thread group leader migrates. It's possible that mm is not
+	 * set, if so charge the init_mm (happens for pagecache usage).
+	 */
+	if (!mm)
+		mm = &init_mm;
+
+	mem = rcu_dereference(mm->mem_cgroup);
+	/*
+	 * For every charge from the cgroup, increment reference
+	 * count
+	 */
+	css_get(&mem->css);
+	rcu_read_unlock();
+
+	/*
+	 * If we created the page_cgroup, we should free it on exceeding
+	 * the cgroup limit.
+	 */
+	if (res_counter_charge(&mem->res, 1)) {
+		css_put(&mem->css);
+		goto free_pc;
+	}
+
+	lock_page_cgroup(page);
+	/*
+	 * Check if somebody else beat us to allocating the page_cgroup
+	 */
+	race_pc = page_get_page_cgroup(page);
+	if (race_pc) {
+		kfree(pc);
+		pc = race_pc;
+		atomic_inc(&pc->ref_cnt);
+		res_counter_uncharge(&mem->res, 1);
+		css_put(&mem->css);
+		goto done;
+	}
+
+	atomic_set(&pc->ref_cnt, 1);
+	pc->mem_cgroup = mem;
+	pc->page = page;
+	page_assign_page_cgroup(page, pc);
+
+done:
+	unlock_page_cgroup(page);
+	return 0;
+free_pc:
+	kfree(pc);
+	return -ENOMEM;
+err:
+	unlock_page_cgroup(page);
+	return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem;
+	struct page *page;
+
+	if (!pc)
+		return;
+
+	if (atomic_dec_and_test(&pc->ref_cnt)) {
+		page = pc->page;
+		lock_page_cgroup(page);
+		mem = pc->mem_cgroup;
+		css_put(&mem->css);
+		page_assign_page_cgroup(page, NULL);
+		unlock_page_cgroup(page);
+		res_counter_uncharge(&mem->res, 1);
+		kfree(pc);
+	}
 }
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
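The heart of the hunk above is mem_cgroup_charge(): the page_cgroup lock cannot be held across the blocking kzalloc(), so the function drops it, allocates, retakes it, and then rechecks whether a racing charger installed a page_cgroup first; the loser frees its copy and just bumps the refcount. The uncharge side drops the refcount and tears everything down on the last put. A simplified userspace model of that optimistic pattern follows (a pthread mutex stands in for the bit spinlock, a plain int for pc->ref_cnt, and the res_counter and css accounting are omitted; the real code also manipulates the refcount atomically outside the lock):

/* Simplified model of the charge/uncharge flow above; not the
 * kernel code, just the lock-drop-and-recheck pattern it uses. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static int *slot;	/* models page->page_cgroup */
static int refs;	/* models pc->ref_cnt */

/* Models mem_cgroup_charge(): allocate with the lock dropped,
 * then retake it and check whether somebody else beat us to it. */
static int charge(void)
{
	int *fresh;

	pthread_mutex_lock(&slot_lock);
	if (slot) {				/* already accounted */
		refs++;
		pthread_mutex_unlock(&slot_lock);
		return 0;
	}
	pthread_mutex_unlock(&slot_lock);

	fresh = calloc(1, sizeof(*fresh));	/* may block or fail */
	if (!fresh)
		return -1;

	pthread_mutex_lock(&slot_lock);
	if (slot) {				/* lost the race */
		free(fresh);
		refs++;
	} else {				/* won: install ours */
		slot = fresh;
		refs = 1;
	}
	pthread_mutex_unlock(&slot_lock);
	return 0;
}

/* Models mem_cgroup_uncharge(): the last reference frees the record. */
static void uncharge(void)
{
	pthread_mutex_lock(&slot_lock);
	if (slot && --refs == 0) {
		free(slot);
		slot = NULL;
	}
	pthread_mutex_unlock(&slot_lock);
}

int main(void)
{
	charge();
	charge();		/* second charge only bumps the refcount */
	uncharge();
	uncharge();		/* last put frees the record */
	printf("slot=%p refs=%d\n", (void *)slot, refs);
	return 0;
}

The recheck after reacquiring the lock is what makes dropping it safe: without it, two concurrent first-time chargers could each install a page_cgroup and the page would be accounted twice.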
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return NULL;
 
 	res_counter_init(&mem->res);
+	INIT_LIST_HEAD(&mem->active_list);
+	INIT_LIST_HEAD(&mem->inactive_list);
 	return &mem->css;
 }
 
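res_counter_charge() and res_counter_uncharge(), used in the charge path above and initialized here via res_counter_init(), come from the res_counter abstraction introduced earlier in this patch series. All this code relies on is the contract that a nonzero return from the charge means the group is over its limit and the caller must back out. A toy single-threaded model of that contract (the real API takes a spinlock and also tracks a failure count):

/* Toy model of the res_counter contract assumed above;
 * counter_charge()/counter_uncharge() are hypothetical names. */
#include <stdio.h>

struct counter {
	unsigned long usage;
	unsigned long limit;
};

/* Nonzero return means the charge would exceed the limit. */
static int counter_charge(struct counter *cnt, unsigned long n)
{
	if (cnt->usage + n > cnt->limit)
		return -1;
	cnt->usage += n;
	return 0;
}

static void counter_uncharge(struct counter *cnt, unsigned long n)
{
	cnt->usage -= n;
}

int main(void)
{
	struct counter res = { .usage = 0, .limit = 2 };
	int r1 = counter_charge(&res, 1);	/* 0: within limit */
	int r2 = counter_charge(&res, 1);	/* 0: at the limit */
	int r3 = counter_charge(&res, 1);	/* -1: over, back out */

	printf("%d %d %d usage=%lu\n", r1, r2, r3, res.usage);
	counter_uncharge(&res, 2);
	return 0;
}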