Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 1192 |
1 file changed, 1192 insertions, 0 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
new file mode 100644
index 000000000000..5c2c702af617
--- /dev/null
+++ b/mm/memcontrol.c
@@ -0,0 +1,1192 @@
1 | /* memcontrol.c - Memory Controller | ||
2 | * | ||
3 | * Copyright IBM Corporation, 2007 | ||
4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> | ||
5 | * | ||
6 | * Copyright 2007 OpenVZ SWsoft Inc | ||
7 | * Author: Pavel Emelianov <xemul@openvz.org> | ||
8 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | ||
10 | * it under the terms of the GNU General Public License as published by | ||
11 | * the Free Software Foundation; either version 2 of the License, or | ||
12 | * (at your option) any later version. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
17 | * GNU General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | #include <linux/res_counter.h> | ||
21 | #include <linux/memcontrol.h> | ||
22 | #include <linux/cgroup.h> | ||
23 | #include <linux/mm.h> | ||
24 | #include <linux/smp.h> | ||
25 | #include <linux/page-flags.h> | ||
26 | #include <linux/backing-dev.h> | ||
27 | #include <linux/bit_spinlock.h> | ||
28 | #include <linux/rcupdate.h> | ||
29 | #include <linux/swap.h> | ||
30 | #include <linux/spinlock.h> | ||
31 | #include <linux/fs.h> | ||
32 | #include <linux/seq_file.h> | ||
33 | |||
34 | #include <asm/uaccess.h> | ||
35 | |||
36 | struct cgroup_subsys mem_cgroup_subsys; | ||
37 | static const int MEM_CGROUP_RECLAIM_RETRIES = 5; | ||
38 | |||
39 | /* | ||
40 | * Statistics for memory cgroup. | ||
41 | */ | ||
42 | enum mem_cgroup_stat_index { | ||
43 | /* | ||
44 | * For MEM_CGROUP_TYPE_ALL, usage = pagecache + rss. | ||
45 | */ | ||
46 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | ||
47 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ | ||
48 | |||
49 | MEM_CGROUP_STAT_NSTATS, | ||
50 | }; | ||
51 | |||
52 | struct mem_cgroup_stat_cpu { | ||
53 | s64 count[MEM_CGROUP_STAT_NSTATS]; | ||
54 | } ____cacheline_aligned_in_smp; | ||
55 | |||
56 | struct mem_cgroup_stat { | ||
57 | struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; | ||
58 | }; | ||
59 | |||
60 | /* | ||
61 | * For accounting under irq disable, there is no need to increment the preempt count. | ||
62 | */ | ||
63 | static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, | ||
64 | enum mem_cgroup_stat_index idx, int val) | ||
65 | { | ||
66 | int cpu = smp_processor_id(); | ||
67 | stat->cpustat[cpu].count[idx] += val; | ||
68 | } | ||
69 | |||
70 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
71 | enum mem_cgroup_stat_index idx) | ||
72 | { | ||
73 | int cpu; | ||
74 | s64 ret = 0; | ||
75 | for_each_possible_cpu(cpu) | ||
76 | ret += stat->cpustat[cpu].count[idx]; | ||
77 | return ret; | ||
78 | } | ||
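
A side note on the statistics pattern above: each CPU owns its own slot in cpustat[], writers bump only their local slot, and mem_cgroup_read_stat() folds every slot together at read time. A minimal userspace sketch of the same idea (illustrative only, not kernel code; the array sizes and names here are made up):

  #include <stdio.h>

  #define NCPU   4    /* stand-in for NR_CPUS */
  #define NSTATS 2    /* stand-in for MEM_CGROUP_STAT_NSTATS */

  /* one counter slot per (cpu, stat); a writer only touches its own CPU's row */
  static long long cpustat[NCPU][NSTATS];

  static long long read_stat(int idx)
  {
          long long sum = 0;
          for (int cpu = 0; cpu < NCPU; cpu++)
                  sum += cpustat[cpu][idx];       /* fold all per-CPU slots on read */
          return sum;
  }

  int main(void)
  {
          cpustat[0][0] += 3;                         /* "CPU 0" charged three pages */
          cpustat[2][0] += 5;                         /* "CPU 2" charged five pages */
          printf("stat[0] = %lld\n", read_stat(0));   /* prints 8 */
          return 0;
  }
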
79 | |||
80 | /* | ||
81 | * per-zone information in memory controller. | ||
82 | */ | ||
83 | |||
84 | enum mem_cgroup_zstat_index { | ||
85 | MEM_CGROUP_ZSTAT_ACTIVE, | ||
86 | MEM_CGROUP_ZSTAT_INACTIVE, | ||
87 | |||
88 | NR_MEM_CGROUP_ZSTAT, | ||
89 | }; | ||
90 | |||
91 | struct mem_cgroup_per_zone { | ||
92 | /* | ||
93 | * spin_lock to protect the per cgroup LRU | ||
94 | */ | ||
95 | spinlock_t lru_lock; | ||
96 | struct list_head active_list; | ||
97 | struct list_head inactive_list; | ||
98 | unsigned long count[NR_MEM_CGROUP_ZSTAT]; | ||
99 | }; | ||
100 | /* Macro for accessing counter */ | ||
101 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | ||
102 | |||
103 | struct mem_cgroup_per_node { | ||
104 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | ||
105 | }; | ||
106 | |||
107 | struct mem_cgroup_lru_info { | ||
108 | struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; | ||
109 | }; | ||
110 | |||
111 | /* | ||
112 | * The memory controller data structure. The memory controller controls both | ||
113 | * page cache and RSS per cgroup. We would eventually like to provide | ||
114 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | ||
115 | * to help the administrator determine what knobs to tune. | ||
116 | * | ||
117 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | ||
118 | * we hit the water mark. Maybe even add a low water mark, such that | ||
119 | * no reclaim occurs from a cgroup at its low water mark; this is | ||
120 | * a feature that will be implemented much later in the future. | ||
121 | */ | ||
122 | struct mem_cgroup { | ||
123 | struct cgroup_subsys_state css; | ||
124 | /* | ||
125 | * the counter to account for memory usage | ||
126 | */ | ||
127 | struct res_counter res; | ||
128 | /* | ||
129 | * Per cgroup active and inactive list, similar to the | ||
130 | * per zone LRU lists. | ||
131 | */ | ||
132 | struct mem_cgroup_lru_info info; | ||
133 | |||
134 | int prev_priority; /* for recording reclaim priority */ | ||
135 | /* | ||
136 | * statistics. | ||
137 | */ | ||
138 | struct mem_cgroup_stat stat; | ||
139 | }; | ||
140 | |||
141 | /* | ||
142 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | ||
143 | * lock. We need to ensure that page->page_cgroup is at least | ||
144 | * two-byte aligned (based on comments from Nick Piggin). | ||
145 | */ | ||
146 | #define PAGE_CGROUP_LOCK_BIT 0x0 | ||
147 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
148 | |||
149 | /* | ||
150 | * A page_cgroup is associated with every page descriptor. The | ||
151 | * page_cgroup helps us identify information about the cgroup | ||
152 | */ | ||
153 | struct page_cgroup { | ||
154 | struct list_head lru; /* per cgroup LRU list */ | ||
155 | struct page *page; | ||
156 | struct mem_cgroup *mem_cgroup; | ||
157 | atomic_t ref_cnt; /* Helpful when pages move b/w */ | ||
158 | /* mapped and cached states */ | ||
159 | int flags; | ||
160 | }; | ||
161 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ | ||
162 | #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ | ||
163 | |||
164 | static inline int page_cgroup_nid(struct page_cgroup *pc) | ||
165 | { | ||
166 | return page_to_nid(pc->page); | ||
167 | } | ||
168 | |||
169 | static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) | ||
170 | { | ||
171 | return page_zonenum(pc->page); | ||
172 | } | ||
173 | |||
174 | enum { | ||
175 | MEM_CGROUP_TYPE_UNSPEC = 0, | ||
176 | MEM_CGROUP_TYPE_MAPPED, | ||
177 | MEM_CGROUP_TYPE_CACHED, | ||
178 | MEM_CGROUP_TYPE_ALL, | ||
179 | MEM_CGROUP_TYPE_MAX, | ||
180 | }; | ||
181 | |||
182 | enum charge_type { | ||
183 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | ||
184 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | ||
185 | }; | ||
186 | |||
187 | |||
188 | /* | ||
189 | * Always modified under the lru lock, so there is no need to preempt_disable(). | ||
190 | */ | ||
191 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, | ||
192 | bool charge) | ||
193 | { | ||
194 | int val = (charge)? 1 : -1; | ||
195 | struct mem_cgroup_stat *stat = &mem->stat; | ||
196 | VM_BUG_ON(!irqs_disabled()); | ||
197 | |||
198 | if (flags & PAGE_CGROUP_FLAG_CACHE) | ||
199 | __mem_cgroup_stat_add_safe(stat, | ||
200 | MEM_CGROUP_STAT_CACHE, val); | ||
201 | else | ||
202 | __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); | ||
203 | } | ||
204 | |||
205 | static inline struct mem_cgroup_per_zone * | ||
206 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
207 | { | ||
208 | BUG_ON(!mem->info.nodeinfo[nid]); | ||
209 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
210 | } | ||
211 | |||
212 | static inline struct mem_cgroup_per_zone * | ||
213 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
214 | { | ||
215 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
216 | int nid = page_cgroup_nid(pc); | ||
217 | int zid = page_cgroup_zid(pc); | ||
218 | |||
219 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
220 | } | ||
221 | |||
222 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | ||
223 | enum mem_cgroup_zstat_index idx) | ||
224 | { | ||
225 | int nid, zid; | ||
226 | struct mem_cgroup_per_zone *mz; | ||
227 | u64 total = 0; | ||
228 | |||
229 | for_each_online_node(nid) | ||
230 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
231 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
232 | total += MEM_CGROUP_ZSTAT(mz, idx); | ||
233 | } | ||
234 | return total; | ||
235 | } | ||
236 | |||
237 | static struct mem_cgroup init_mem_cgroup; | ||
238 | |||
239 | static inline | ||
240 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | ||
241 | { | ||
242 | return container_of(cgroup_subsys_state(cont, | ||
243 | mem_cgroup_subsys_id), struct mem_cgroup, | ||
244 | css); | ||
245 | } | ||
246 | |||
247 | static inline | ||
248 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | ||
249 | { | ||
250 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | ||
251 | struct mem_cgroup, css); | ||
252 | } | ||
253 | |||
254 | void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p) | ||
255 | { | ||
256 | struct mem_cgroup *mem; | ||
257 | |||
258 | mem = mem_cgroup_from_task(p); | ||
259 | css_get(&mem->css); | ||
260 | mm->mem_cgroup = mem; | ||
261 | } | ||
262 | |||
263 | void mm_free_cgroup(struct mm_struct *mm) | ||
264 | { | ||
265 | css_put(&mm->mem_cgroup->css); | ||
266 | } | ||
267 | |||
268 | static inline int page_cgroup_locked(struct page *page) | ||
269 | { | ||
270 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, | ||
271 | &page->page_cgroup); | ||
272 | } | ||
273 | |||
274 | void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | ||
275 | { | ||
276 | int locked; | ||
277 | |||
278 | /* | ||
279 | * While resetting the page_cgroup we might not hold the | ||
280 | * page_cgroup lock. free_hot_cold_page() is an example | ||
281 | * of such a scenario | ||
282 | */ | ||
283 | if (pc) | ||
284 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
285 | locked = (page->page_cgroup & PAGE_CGROUP_LOCK); | ||
286 | page->page_cgroup = ((unsigned long)pc | locked); | ||
287 | } | ||
288 | |||
289 | struct page_cgroup *page_get_page_cgroup(struct page *page) | ||
290 | { | ||
291 | return (struct page_cgroup *) | ||
292 | (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
293 | } | ||
294 | |||
295 | static void __always_inline lock_page_cgroup(struct page *page) | ||
296 | { | ||
297 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
298 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
299 | } | ||
300 | |||
301 | static void __always_inline unlock_page_cgroup(struct page *page) | ||
302 | { | ||
303 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
304 | } | ||
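
To make the lock-bit trick concrete: because every page_cgroup is at least two-byte aligned, bit 0 of page->page_cgroup is always clear, so it can double as a lock flag while the upper bits still hold the pointer. A single-threaded userspace sketch of the encoding (illustrative only; it shows the masking, not the real bit_spin_lock() atomics):

  #include <assert.h>
  #include <stdio.h>

  #define LOCK_BIT 0x1UL                          /* same role as PAGE_CGROUP_LOCK */

  struct page_cgroup { int dummy; };

  /* mirror page_assign_page_cgroup(): store the pointer, preserve the lock bit */
  static void assign(unsigned long *slot, struct page_cgroup *pc)
  {
          unsigned long locked = *slot & LOCK_BIT;

          assert(((unsigned long)pc & LOCK_BIT) == 0);   /* needs >= 2-byte alignment */
          *slot = (unsigned long)pc | locked;
  }

  /* mirror page_get_page_cgroup(): mask the lock bit off before dereferencing */
  static struct page_cgroup *get(unsigned long slot)
  {
          return (struct page_cgroup *)(slot & ~LOCK_BIT);
  }

  int main(void)
  {
          static struct page_cgroup pc;
          unsigned long slot = LOCK_BIT;          /* pretend the bit lock is held */

          assign(&slot, &pc);
          printf("%d\n", get(slot) == &pc);       /* 1: pointer survives, lock bit intact */
          return 0;
  }
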
305 | |||
306 | /* | ||
307 | * Tie new page_cgroup to struct page under lock_page_cgroup() | ||
308 | * This can fail if the page has already been tied to a page_cgroup. | ||
309 | * Returns 0 on success. | ||
310 | */ | ||
311 | static int page_cgroup_assign_new_page_cgroup(struct page *page, | ||
312 | struct page_cgroup *pc) | ||
313 | { | ||
314 | int ret = 0; | ||
315 | |||
316 | lock_page_cgroup(page); | ||
317 | if (!page_get_page_cgroup(page)) | ||
318 | page_assign_page_cgroup(page, pc); | ||
319 | else /* A page is tied to other pc. */ | ||
320 | ret = 1; | ||
321 | unlock_page_cgroup(page); | ||
322 | return ret; | ||
323 | } | ||
324 | |||
325 | /* | ||
326 | * Clear page->page_cgroup member under lock_page_cgroup(). | ||
327 | * If the given "pc" value is different from the one in page->page_cgroup, | ||
328 | * page->page_cgroup is not cleared. | ||
329 | * Returns the value of page->page_cgroup at the time the lock was taken. | ||
330 | * A caller can detect a failed clear by checking | ||
331 | * clear_page_cgroup(page, pc) == pc | ||
332 | */ | ||
333 | |||
334 | static struct page_cgroup *clear_page_cgroup(struct page *page, | ||
335 | struct page_cgroup *pc) | ||
336 | { | ||
337 | struct page_cgroup *ret; | ||
338 | /* lock and clear */ | ||
339 | lock_page_cgroup(page); | ||
340 | ret = page_get_page_cgroup(page); | ||
341 | if (likely(ret == pc)) | ||
342 | page_assign_page_cgroup(page, NULL); | ||
343 | unlock_page_cgroup(page); | ||
344 | return ret; | ||
345 | } | ||
346 | |||
347 | static void __mem_cgroup_remove_list(struct page_cgroup *pc) | ||
348 | { | ||
349 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
350 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | ||
351 | |||
352 | if (from) | ||
353 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | ||
354 | else | ||
355 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
356 | |||
357 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); | ||
358 | list_del_init(&pc->lru); | ||
359 | } | ||
360 | |||
361 | static void __mem_cgroup_add_list(struct page_cgroup *pc) | ||
362 | { | ||
363 | int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
364 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | ||
365 | |||
366 | if (!to) { | ||
367 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | ||
368 | list_add(&pc->lru, &mz->inactive_list); | ||
369 | } else { | ||
370 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | ||
371 | list_add(&pc->lru, &mz->active_list); | ||
372 | } | ||
373 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); | ||
374 | } | ||
375 | |||
376 | static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | ||
377 | { | ||
378 | int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; | ||
379 | struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); | ||
380 | |||
381 | if (from) | ||
382 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; | ||
383 | else | ||
384 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | ||
385 | |||
386 | if (active) { | ||
387 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; | ||
388 | pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; | ||
389 | list_move(&pc->lru, &mz->active_list); | ||
390 | } else { | ||
391 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; | ||
392 | pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; | ||
393 | list_move(&pc->lru, &mz->inactive_list); | ||
394 | } | ||
395 | } | ||
396 | |||
397 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | ||
398 | { | ||
399 | int ret; | ||
400 | |||
401 | task_lock(task); | ||
402 | ret = task->mm && mm_cgroup(task->mm) == mem; | ||
403 | task_unlock(task); | ||
404 | return ret; | ||
405 | } | ||
406 | |||
407 | /* | ||
408 | * This routine assumes that the appropriate zone's lru lock is already held | ||
409 | */ | ||
410 | void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | ||
411 | { | ||
412 | struct mem_cgroup_per_zone *mz; | ||
413 | unsigned long flags; | ||
414 | |||
415 | if (!pc) | ||
416 | return; | ||
417 | |||
418 | mz = page_cgroup_zoneinfo(pc); | ||
419 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
420 | __mem_cgroup_move_lists(pc, active); | ||
421 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
422 | } | ||
423 | |||
424 | /* | ||
425 | * Calculate mapped_ratio under the memory controller. This will be used in | ||
426 | * vmscan.c for determining whether we have to reclaim mapped pages. | ||
427 | */ | ||
428 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | ||
429 | { | ||
430 | long total, rss; | ||
431 | |||
432 | /* | ||
433 | * usage is recorded in bytes. But, here, we assume the number of | ||
434 | * physical pages can be represented by "long" on any arch. | ||
435 | */ | ||
436 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; | ||
437 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | ||
438 | return (int)((rss * 100L) / total); | ||
439 | } | ||
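
To put numbers on the ratio above: with a usage of 16 MB (4096 pages, so total becomes 4097 after the +1) and an RSS stat of 1024 pages, the routine returns 1024 * 100 / 4097, i.e. roughly 25 percent (24 after integer truncation).
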
440 | /* | ||
441 | * This function is called from vmscan.c's page reclaiming loop, where the | ||
442 | * balance between the active and inactive lists is calculated. For memory | ||
443 | * controller page reclaim, we should use the mem_cgroup's imbalance rather | ||
444 | * than the zone's global LRU imbalance. | ||
445 | */ | ||
446 | long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) | ||
447 | { | ||
448 | unsigned long active, inactive; | ||
449 | /* active and inactive are the number of pages. 'long' is ok.*/ | ||
450 | active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); | ||
451 | inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); | ||
452 | return (long) (active / (inactive + 1)); | ||
453 | } | ||
454 | |||
455 | /* | ||
456 | * prev_priority control. This will be used in the memory reclaim path. | ||
457 | */ | ||
458 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | ||
459 | { | ||
460 | return mem->prev_priority; | ||
461 | } | ||
462 | |||
463 | void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) | ||
464 | { | ||
465 | if (priority < mem->prev_priority) | ||
466 | mem->prev_priority = priority; | ||
467 | } | ||
468 | |||
469 | void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) | ||
470 | { | ||
471 | mem->prev_priority = priority; | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * Calculate # of pages to be scanned in this priority/zone. | ||
476 | * See also vmscan.c | ||
477 | * | ||
478 | * priority starts from "DEF_PRIORITY" and is decremented in each loop. | ||
479 | * (see include/linux/mmzone.h) | ||
480 | */ | ||
481 | |||
482 | long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, | ||
483 | struct zone *zone, int priority) | ||
484 | { | ||
485 | long nr_active; | ||
486 | int nid = zone->zone_pgdat->node_id; | ||
487 | int zid = zone_idx(zone); | ||
488 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
489 | |||
490 | nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); | ||
491 | return (nr_active >> priority); | ||
492 | } | ||
493 | |||
494 | long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, | ||
495 | struct zone *zone, int priority) | ||
496 | { | ||
497 | long nr_inactive; | ||
498 | int nid = zone->zone_pgdat->node_id; | ||
499 | int zid = zone_idx(zone); | ||
500 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
501 | |||
502 | nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); | ||
503 | |||
504 | return (nr_inactive >> priority); | ||
505 | } | ||
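
As a quick sanity check of the shift above: with 8192 inactive pages in a zone, the scan target is 8192 >> 12 = 2 pages at DEF_PRIORITY (12), and it doubles each time the priority drops by one, until priority 0 would scan the whole list.
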
506 | |||
507 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | ||
508 | struct list_head *dst, | ||
509 | unsigned long *scanned, int order, | ||
510 | int mode, struct zone *z, | ||
511 | struct mem_cgroup *mem_cont, | ||
512 | int active) | ||
513 | { | ||
514 | unsigned long nr_taken = 0; | ||
515 | struct page *page; | ||
516 | unsigned long scan; | ||
517 | LIST_HEAD(pc_list); | ||
518 | struct list_head *src; | ||
519 | struct page_cgroup *pc, *tmp; | ||
520 | int nid = z->zone_pgdat->node_id; | ||
521 | int zid = zone_idx(z); | ||
522 | struct mem_cgroup_per_zone *mz; | ||
523 | |||
524 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | ||
525 | if (active) | ||
526 | src = &mz->active_list; | ||
527 | else | ||
528 | src = &mz->inactive_list; | ||
529 | |||
530 | |||
531 | spin_lock(&mz->lru_lock); | ||
532 | scan = 0; | ||
533 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | ||
534 | if (scan >= nr_to_scan) | ||
535 | break; | ||
536 | VM_BUG_ON(!pc); | ||
537 | page = pc->page; | ||
538 | |||
539 | if (unlikely(!PageLRU(page))) | ||
540 | continue; | ||
541 | |||
542 | if (PageActive(page) && !active) { | ||
543 | __mem_cgroup_move_lists(pc, true); | ||
544 | continue; | ||
545 | } | ||
546 | if (!PageActive(page) && active) { | ||
547 | __mem_cgroup_move_lists(pc, false); | ||
548 | continue; | ||
549 | } | ||
550 | |||
551 | scan++; | ||
552 | list_move(&pc->lru, &pc_list); | ||
553 | |||
554 | if (__isolate_lru_page(page, mode) == 0) { | ||
555 | list_move(&page->lru, dst); | ||
556 | nr_taken++; | ||
557 | } | ||
558 | } | ||
559 | |||
560 | list_splice(&pc_list, src); | ||
561 | spin_unlock(&mz->lru_lock); | ||
562 | |||
563 | *scanned = scan; | ||
564 | return nr_taken; | ||
565 | } | ||
566 | |||
567 | /* | ||
568 | * Charge the memory controller for page usage. | ||
569 | * Return | ||
570 | * 0 if the charge was successful | ||
571 | * < 0 if the cgroup is over its limit | ||
572 | */ | ||
573 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | ||
574 | gfp_t gfp_mask, enum charge_type ctype) | ||
575 | { | ||
576 | struct mem_cgroup *mem; | ||
577 | struct page_cgroup *pc; | ||
578 | unsigned long flags; | ||
579 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
580 | struct mem_cgroup_per_zone *mz; | ||
581 | |||
582 | /* | ||
583 | * Should page_cgroup's go to their own slab? | ||
584 | * One could optimize the performance of the charging routine | ||
585 | * by saving a bit in the page_flags and using it as a lock | ||
586 | * to see if the cgroup page already has a page_cgroup associated | ||
587 | * with it | ||
588 | */ | ||
589 | retry: | ||
590 | if (page) { | ||
591 | lock_page_cgroup(page); | ||
592 | pc = page_get_page_cgroup(page); | ||
593 | /* | ||
594 | * The page_cgroup exists and | ||
595 | * the page has already been accounted. | ||
596 | */ | ||
597 | if (pc) { | ||
598 | if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { | ||
599 | /* is this page being uncharged? */ | ||
600 | unlock_page_cgroup(page); | ||
601 | cpu_relax(); | ||
602 | goto retry; | ||
603 | } else { | ||
604 | unlock_page_cgroup(page); | ||
605 | goto done; | ||
606 | } | ||
607 | } | ||
608 | unlock_page_cgroup(page); | ||
609 | } | ||
610 | |||
611 | pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); | ||
612 | if (pc == NULL) | ||
613 | goto err; | ||
614 | |||
615 | /* | ||
616 | * We always charge the cgroup the mm_struct belongs to. | ||
617 | * The mm_struct's mem_cgroup changes on task migration if the | ||
618 | * thread group leader migrates. It's possible that mm is not | ||
619 | * set, if so charge the init_mm (happens for pagecache usage). | ||
620 | */ | ||
621 | if (!mm) | ||
622 | mm = &init_mm; | ||
623 | |||
624 | rcu_read_lock(); | ||
625 | mem = rcu_dereference(mm->mem_cgroup); | ||
626 | /* | ||
627 | * For every charge from the cgroup, increment reference | ||
628 | * count | ||
629 | */ | ||
630 | css_get(&mem->css); | ||
631 | rcu_read_unlock(); | ||
632 | |||
633 | /* | ||
634 | * If we created the page_cgroup, we should free it on exceeding | ||
635 | * the cgroup limit. | ||
636 | */ | ||
637 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { | ||
638 | if (!(gfp_mask & __GFP_WAIT)) | ||
639 | goto out; | ||
640 | |||
641 | if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) | ||
642 | continue; | ||
643 | |||
644 | /* | ||
645 | * try_to_free_mem_cgroup_pages() might not give us a full | ||
646 | * picture of reclaim. Some pages are reclaimed and might be | ||
647 | * moved to swap cache or just unmapped from the cgroup. | ||
648 | * Check the limit again to see if the reclaim reduced the | ||
649 | * current usage of the cgroup before giving up | ||
650 | */ | ||
651 | if (res_counter_check_under_limit(&mem->res)) | ||
652 | continue; | ||
653 | |||
654 | if (!nr_retries--) { | ||
655 | mem_cgroup_out_of_memory(mem, gfp_mask); | ||
656 | goto out; | ||
657 | } | ||
658 | congestion_wait(WRITE, HZ/10); | ||
659 | } | ||
660 | |||
661 | atomic_set(&pc->ref_cnt, 1); | ||
662 | pc->mem_cgroup = mem; | ||
663 | pc->page = page; | ||
664 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | ||
665 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) | ||
666 | pc->flags |= PAGE_CGROUP_FLAG_CACHE; | ||
667 | |||
668 | if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) { | ||
669 | /* | ||
670 | * Another charge has been added to this page already. | ||
671 | * We take lock_page_cgroup(page) again and read | ||
672 | * page->page_cgroup and increment the refcnt... just retrying is OK. | ||
673 | */ | ||
674 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
675 | css_put(&mem->css); | ||
676 | kfree(pc); | ||
677 | if (!page) | ||
678 | goto done; | ||
679 | goto retry; | ||
680 | } | ||
681 | |||
682 | mz = page_cgroup_zoneinfo(pc); | ||
683 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
684 | /* Update statistics vector */ | ||
685 | __mem_cgroup_add_list(pc); | ||
686 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
687 | |||
688 | done: | ||
689 | return 0; | ||
690 | out: | ||
691 | css_put(&mem->css); | ||
692 | kfree(pc); | ||
693 | err: | ||
694 | return -ENOMEM; | ||
695 | } | ||
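
The heart of the routine above is its charge/reclaim loop: attempt the charge, and when the counter is over its limit try to reclaim, re-check the limit, and only after a bounded number of fruitless rounds fall back to the cgroup OOM path. A compressed userspace sketch of that control flow (illustrative only; the toy counter and one-page reclaim stand in for res_counter_charge() and try_to_free_mem_cgroup_pages()):

  #include <stdbool.h>
  #include <stdio.h>

  #define RECLAIM_RETRIES 5               /* mirrors MEM_CGROUP_RECLAIM_RETRIES */

  struct counter { long usage, limit; };  /* toy stand-in for struct res_counter */

  static bool try_charge(struct counter *c, long pages)
  {
          if (c->usage + pages > c->limit)
                  return false;           /* over limit: refuse the charge */
          c->usage += pages;
          return true;
  }

  static bool reclaim_one(struct counter *c)
  {
          if (c->usage == 0)
                  return false;           /* nothing left to reclaim */
          c->usage -= 1;                  /* pretend one page was freed */
          return true;
  }

  /* mirror the retry logic: reclaim, re-check the limit, give up after N rounds */
  static int charge_with_reclaim(struct counter *c)
  {
          int retries = RECLAIM_RETRIES;

          while (!try_charge(c, 1)) {
                  if (reclaim_one(c))
                          continue;       /* reclaim made progress: retry the charge */
                  if (c->usage + 1 <= c->limit)
                          continue;       /* someone uncharged meanwhile: retry */
                  if (!retries--)
                          return -1;      /* out of retries: the kernel would OOM-kill */
          }
          return 0;
  }

  int main(void)
  {
          struct counter c = { .usage = 10, .limit = 10 };

          printf("%d\n", charge_with_reclaim(&c));   /* 0: reclaim made room */
          return 0;
  }
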
696 | |||
697 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, | ||
698 | gfp_t gfp_mask) | ||
699 | { | ||
700 | return mem_cgroup_charge_common(page, mm, gfp_mask, | ||
701 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
702 | } | ||
703 | |||
704 | /* | ||
705 | * See if cached pages should be charged at all. | ||
706 | */ | ||
707 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | ||
708 | gfp_t gfp_mask) | ||
709 | { | ||
710 | int ret = 0; | ||
711 | if (!mm) | ||
712 | mm = &init_mm; | ||
713 | |||
714 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | ||
715 | MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
716 | return ret; | ||
717 | } | ||
718 | |||
719 | /* | ||
720 | * Uncharging is always a welcome operation; we never complain, we simply | ||
721 | * uncharge. This routine should be called with lock_page_cgroup held. | ||
722 | */ | ||
723 | void mem_cgroup_uncharge(struct page_cgroup *pc) | ||
724 | { | ||
725 | struct mem_cgroup *mem; | ||
726 | struct mem_cgroup_per_zone *mz; | ||
727 | struct page *page; | ||
728 | unsigned long flags; | ||
729 | |||
730 | /* | ||
731 | * Check if our page_cgroup is valid | ||
732 | */ | ||
733 | if (!pc) | ||
734 | return; | ||
735 | |||
736 | if (atomic_dec_and_test(&pc->ref_cnt)) { | ||
737 | page = pc->page; | ||
738 | mz = page_cgroup_zoneinfo(pc); | ||
739 | /* | ||
740 | * get page->cgroup and clear it under lock. | ||
741 | * force_empty can drop page->cgroup without checking refcnt. | ||
742 | */ | ||
743 | unlock_page_cgroup(page); | ||
744 | if (clear_page_cgroup(page, pc) == pc) { | ||
745 | mem = pc->mem_cgroup; | ||
746 | css_put(&mem->css); | ||
747 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
748 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
749 | __mem_cgroup_remove_list(pc); | ||
750 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
751 | kfree(pc); | ||
752 | } | ||
753 | lock_page_cgroup(page); | ||
754 | } | ||
755 | } | ||
756 | |||
757 | void mem_cgroup_uncharge_page(struct page *page) | ||
758 | { | ||
759 | lock_page_cgroup(page); | ||
760 | mem_cgroup_uncharge(page_get_page_cgroup(page)); | ||
761 | unlock_page_cgroup(page); | ||
762 | } | ||
763 | |||
764 | /* | ||
765 | * Returns non-zero if a page (under migration) has a valid page_cgroup member. | ||
766 | * The refcnt of the page_cgroup is incremented. | ||
767 | */ | ||
768 | |||
769 | int mem_cgroup_prepare_migration(struct page *page) | ||
770 | { | ||
771 | struct page_cgroup *pc; | ||
772 | int ret = 0; | ||
773 | lock_page_cgroup(page); | ||
774 | pc = page_get_page_cgroup(page); | ||
775 | if (pc && atomic_inc_not_zero(&pc->ref_cnt)) | ||
776 | ret = 1; | ||
777 | unlock_page_cgroup(page); | ||
778 | return ret; | ||
779 | } | ||
780 | |||
781 | void mem_cgroup_end_migration(struct page *page) | ||
782 | { | ||
783 | struct page_cgroup *pc; | ||
784 | |||
785 | lock_page_cgroup(page); | ||
786 | pc = page_get_page_cgroup(page); | ||
787 | mem_cgroup_uncharge(pc); | ||
788 | unlock_page_cgroup(page); | ||
789 | } | ||
790 | /* | ||
791 | * We know both *page* and *newpage* are now not on the LRU and PG_locked. | ||
792 | * And there is no race with the uncharge() routines because the page_cgroup | ||
793 | * for *page* has one extra reference taken by mem_cgroup_prepare_migration. | ||
794 | */ | ||
795 | |||
796 | void mem_cgroup_page_migration(struct page *page, struct page *newpage) | ||
797 | { | ||
798 | struct page_cgroup *pc; | ||
799 | struct mem_cgroup *mem; | ||
800 | unsigned long flags; | ||
801 | struct mem_cgroup_per_zone *mz; | ||
802 | retry: | ||
803 | pc = page_get_page_cgroup(page); | ||
804 | if (!pc) | ||
805 | return; | ||
806 | mem = pc->mem_cgroup; | ||
807 | mz = page_cgroup_zoneinfo(pc); | ||
808 | if (clear_page_cgroup(page, pc) != pc) | ||
809 | goto retry; | ||
810 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
811 | |||
812 | __mem_cgroup_remove_list(pc); | ||
813 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
814 | |||
815 | pc->page = newpage; | ||
816 | lock_page_cgroup(newpage); | ||
817 | page_assign_page_cgroup(newpage, pc); | ||
818 | unlock_page_cgroup(newpage); | ||
819 | |||
820 | mz = page_cgroup_zoneinfo(pc); | ||
821 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
822 | __mem_cgroup_add_list(pc); | ||
823 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
824 | return; | ||
825 | } | ||
826 | |||
827 | /* | ||
828 | * This routine traverses the page_cgroups in the given list and drops them all. | ||
829 | * This routine ignores page_cgroup->ref_cnt. | ||
830 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | ||
831 | */ | ||
832 | #define FORCE_UNCHARGE_BATCH (128) | ||
833 | static void | ||
834 | mem_cgroup_force_empty_list(struct mem_cgroup *mem, | ||
835 | struct mem_cgroup_per_zone *mz, | ||
836 | int active) | ||
837 | { | ||
838 | struct page_cgroup *pc; | ||
839 | struct page *page; | ||
840 | int count; | ||
841 | unsigned long flags; | ||
842 | struct list_head *list; | ||
843 | |||
844 | if (active) | ||
845 | list = &mz->active_list; | ||
846 | else | ||
847 | list = &mz->inactive_list; | ||
848 | |||
849 | if (list_empty(list)) | ||
850 | return; | ||
851 | retry: | ||
852 | count = FORCE_UNCHARGE_BATCH; | ||
853 | spin_lock_irqsave(&mz->lru_lock, flags); | ||
854 | |||
855 | while (--count && !list_empty(list)) { | ||
856 | pc = list_entry(list->prev, struct page_cgroup, lru); | ||
857 | page = pc->page; | ||
858 | /* Avoid race with charge */ | ||
859 | atomic_set(&pc->ref_cnt, 0); | ||
860 | if (clear_page_cgroup(page, pc) == pc) { | ||
861 | css_put(&mem->css); | ||
862 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
863 | __mem_cgroup_remove_list(pc); | ||
864 | kfree(pc); | ||
865 | } else /* being uncharged ? ...do relax */ | ||
866 | break; | ||
867 | } | ||
868 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
869 | if (!list_empty(list)) { | ||
870 | cond_resched(); | ||
871 | goto retry; | ||
872 | } | ||
873 | return; | ||
874 | } | ||
875 | |||
876 | /* | ||
877 | * Make the mem_cgroup's charge 0 if there are no tasks. | ||
878 | * This enables deleting this mem_cgroup. | ||
879 | */ | ||
880 | |||
881 | int mem_cgroup_force_empty(struct mem_cgroup *mem) | ||
882 | { | ||
883 | int ret = -EBUSY; | ||
884 | int node, zid; | ||
885 | css_get(&mem->css); | ||
886 | /* | ||
887 | * page reclaim code (kswapd etc..) will move pages between | ||
888 | * active_list <-> inactive_list while we don't take a lock. | ||
889 | * So, we have to do loop here until all lists are empty. | ||
890 | */ | ||
891 | while (mem->res.usage > 0) { | ||
892 | if (atomic_read(&mem->css.cgroup->count) > 0) | ||
893 | goto out; | ||
894 | for_each_node_state(node, N_POSSIBLE) | ||
895 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
896 | struct mem_cgroup_per_zone *mz; | ||
897 | mz = mem_cgroup_zoneinfo(mem, node, zid); | ||
898 | /* drop all page_cgroup in active_list */ | ||
899 | mem_cgroup_force_empty_list(mem, mz, 1); | ||
900 | /* drop all page_cgroup in inactive_list */ | ||
901 | mem_cgroup_force_empty_list(mem, mz, 0); | ||
902 | } | ||
903 | } | ||
904 | ret = 0; | ||
905 | out: | ||
906 | css_put(&mem->css); | ||
907 | return ret; | ||
908 | } | ||
909 | |||
910 | |||
911 | |||
912 | int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) | ||
913 | { | ||
914 | *tmp = memparse(buf, &buf); | ||
915 | if (*buf != '\0') | ||
916 | return -EINVAL; | ||
917 | |||
918 | /* | ||
919 | * Round up the value to the nearest page boundary | ||
920 | */ | ||
921 | *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; | ||
922 | return 0; | ||
923 | } | ||
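
The rounding in mem_cgroup_write_strategy() is the usual round-up-to-a-page-boundary idiom; a tiny standalone sketch (a PAGE_SHIFT of 12 is assumed here purely for illustration):

  #include <stdio.h>

  #define PAGE_SHIFT 12
  #define PAGE_SIZE  (1UL << PAGE_SHIFT)

  /* round an arbitrary byte count up to the next page boundary */
  static unsigned long long round_to_page(unsigned long long bytes)
  {
          return ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
  }

  int main(void)
  {
          printf("%llu\n", round_to_page(1));      /* 4096 */
          printf("%llu\n", round_to_page(4097));   /* 8192 */
          return 0;
  }
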
924 | |||
925 | static ssize_t mem_cgroup_read(struct cgroup *cont, | ||
926 | struct cftype *cft, struct file *file, | ||
927 | char __user *userbuf, size_t nbytes, loff_t *ppos) | ||
928 | { | ||
929 | return res_counter_read(&mem_cgroup_from_cont(cont)->res, | ||
930 | cft->private, userbuf, nbytes, ppos, | ||
931 | NULL); | ||
932 | } | ||
933 | |||
934 | static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | ||
935 | struct file *file, const char __user *userbuf, | ||
936 | size_t nbytes, loff_t *ppos) | ||
937 | { | ||
938 | return res_counter_write(&mem_cgroup_from_cont(cont)->res, | ||
939 | cft->private, userbuf, nbytes, ppos, | ||
940 | mem_cgroup_write_strategy); | ||
941 | } | ||
942 | |||
943 | static ssize_t mem_force_empty_write(struct cgroup *cont, | ||
944 | struct cftype *cft, struct file *file, | ||
945 | const char __user *userbuf, | ||
946 | size_t nbytes, loff_t *ppos) | ||
947 | { | ||
948 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
949 | int ret; | ||
950 | ret = mem_cgroup_force_empty(mem); | ||
951 | if (!ret) | ||
952 | ret = nbytes; | ||
953 | return ret; | ||
954 | } | ||
955 | |||
956 | /* | ||
957 | * Note: This should be removed if cgroup supports write-only file. | ||
958 | */ | ||
959 | |||
960 | static ssize_t mem_force_empty_read(struct cgroup *cont, | ||
961 | struct cftype *cft, | ||
962 | struct file *file, char __user *userbuf, | ||
963 | size_t nbytes, loff_t *ppos) | ||
964 | { | ||
965 | return -EINVAL; | ||
966 | } | ||
967 | |||
968 | |||
969 | static const struct mem_cgroup_stat_desc { | ||
970 | const char *msg; | ||
971 | u64 unit; | ||
972 | } mem_cgroup_stat_desc[] = { | ||
973 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, | ||
974 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, | ||
975 | }; | ||
976 | |||
977 | static int mem_control_stat_show(struct seq_file *m, void *arg) | ||
978 | { | ||
979 | struct cgroup *cont = m->private; | ||
980 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | ||
981 | struct mem_cgroup_stat *stat = &mem_cont->stat; | ||
982 | int i; | ||
983 | |||
984 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { | ||
985 | s64 val; | ||
986 | |||
987 | val = mem_cgroup_read_stat(stat, i); | ||
988 | val *= mem_cgroup_stat_desc[i].unit; | ||
989 | seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, | ||
990 | (long long)val); | ||
991 | } | ||
992 | /* showing # of active pages */ | ||
993 | { | ||
994 | unsigned long active, inactive; | ||
995 | |||
996 | inactive = mem_cgroup_get_all_zonestat(mem_cont, | ||
997 | MEM_CGROUP_ZSTAT_INACTIVE); | ||
998 | active = mem_cgroup_get_all_zonestat(mem_cont, | ||
999 | MEM_CGROUP_ZSTAT_ACTIVE); | ||
1000 | seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); | ||
1001 | seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); | ||
1002 | } | ||
1003 | return 0; | ||
1004 | } | ||
1005 | |||
1006 | static const struct file_operations mem_control_stat_file_operations = { | ||
1007 | .read = seq_read, | ||
1008 | .llseek = seq_lseek, | ||
1009 | .release = single_release, | ||
1010 | }; | ||
1011 | |||
1012 | static int mem_control_stat_open(struct inode *unused, struct file *file) | ||
1013 | { | ||
1014 | /* XXX __d_cont */ | ||
1015 | struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; | ||
1016 | |||
1017 | file->f_op = &mem_control_stat_file_operations; | ||
1018 | return single_open(file, mem_control_stat_show, cont); | ||
1019 | } | ||
1020 | |||
1021 | |||
1022 | |||
1023 | static struct cftype mem_cgroup_files[] = { | ||
1024 | { | ||
1025 | .name = "usage_in_bytes", | ||
1026 | .private = RES_USAGE, | ||
1027 | .read = mem_cgroup_read, | ||
1028 | }, | ||
1029 | { | ||
1030 | .name = "limit_in_bytes", | ||
1031 | .private = RES_LIMIT, | ||
1032 | .write = mem_cgroup_write, | ||
1033 | .read = mem_cgroup_read, | ||
1034 | }, | ||
1035 | { | ||
1036 | .name = "failcnt", | ||
1037 | .private = RES_FAILCNT, | ||
1038 | .read = mem_cgroup_read, | ||
1039 | }, | ||
1040 | { | ||
1041 | .name = "force_empty", | ||
1042 | .write = mem_force_empty_write, | ||
1043 | .read = mem_force_empty_read, | ||
1044 | }, | ||
1045 | { | ||
1046 | .name = "stat", | ||
1047 | .open = mem_control_stat_open, | ||
1048 | }, | ||
1049 | }; | ||
1050 | |||
1051 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | ||
1052 | { | ||
1053 | struct mem_cgroup_per_node *pn; | ||
1054 | struct mem_cgroup_per_zone *mz; | ||
1055 | int zone; | ||
1056 | /* | ||
1057 | * This routine is called against possible nodes. | ||
1058 | * But it is a BUG to call kmalloc() against an offline node. | ||
1059 | * | ||
1060 | * TODO: this routine can waste much memory for nodes which will | ||
1061 | * never be onlined. It's better to use memory hotplug callback | ||
1062 | * function. | ||
1063 | */ | ||
1064 | if (node_state(node, N_HIGH_MEMORY)) | ||
1065 | pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node); | ||
1066 | else | ||
1067 | pn = kmalloc(sizeof(*pn), GFP_KERNEL); | ||
1068 | if (!pn) | ||
1069 | return 1; | ||
1070 | |||
1071 | mem->info.nodeinfo[node] = pn; | ||
1072 | memset(pn, 0, sizeof(*pn)); | ||
1073 | |||
1074 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
1075 | mz = &pn->zoneinfo[zone]; | ||
1076 | INIT_LIST_HEAD(&mz->active_list); | ||
1077 | INIT_LIST_HEAD(&mz->inactive_list); | ||
1078 | spin_lock_init(&mz->lru_lock); | ||
1079 | } | ||
1080 | return 0; | ||
1081 | } | ||
1082 | |||
1083 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | ||
1084 | { | ||
1085 | kfree(mem->info.nodeinfo[node]); | ||
1086 | } | ||
1087 | |||
1088 | |||
1089 | static struct mem_cgroup init_mem_cgroup; | ||
1090 | |||
1091 | static struct cgroup_subsys_state * | ||
1092 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | ||
1093 | { | ||
1094 | struct mem_cgroup *mem; | ||
1095 | int node; | ||
1096 | |||
1097 | if (unlikely((cont->parent) == NULL)) { | ||
1098 | mem = &init_mem_cgroup; | ||
1099 | init_mm.mem_cgroup = mem; | ||
1100 | } else | ||
1101 | mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL); | ||
1102 | |||
1103 | if (mem == NULL) | ||
1104 | return NULL; | ||
1105 | |||
1106 | res_counter_init(&mem->res); | ||
1107 | |||
1108 | memset(&mem->info, 0, sizeof(mem->info)); | ||
1109 | |||
1110 | for_each_node_state(node, N_POSSIBLE) | ||
1111 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | ||
1112 | goto free_out; | ||
1113 | |||
1114 | return &mem->css; | ||
1115 | free_out: | ||
1116 | for_each_node_state(node, N_POSSIBLE) | ||
1117 | free_mem_cgroup_per_zone_info(mem, node); | ||
1118 | if (cont->parent != NULL) | ||
1119 | kfree(mem); | ||
1120 | return NULL; | ||
1121 | } | ||
1122 | |||
1123 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | ||
1124 | struct cgroup *cont) | ||
1125 | { | ||
1126 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1127 | mem_cgroup_force_empty(mem); | ||
1128 | } | ||
1129 | |||
1130 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | ||
1131 | struct cgroup *cont) | ||
1132 | { | ||
1133 | int node; | ||
1134 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | ||
1135 | |||
1136 | for_each_node_state(node, N_POSSIBLE) | ||
1137 | free_mem_cgroup_per_zone_info(mem, node); | ||
1138 | |||
1139 | kfree(mem_cgroup_from_cont(cont)); | ||
1140 | } | ||
1141 | |||
1142 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | ||
1143 | struct cgroup *cont) | ||
1144 | { | ||
1145 | return cgroup_add_files(cont, ss, mem_cgroup_files, | ||
1146 | ARRAY_SIZE(mem_cgroup_files)); | ||
1147 | } | ||
1148 | |||
1149 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
1150 | struct cgroup *cont, | ||
1151 | struct cgroup *old_cont, | ||
1152 | struct task_struct *p) | ||
1153 | { | ||
1154 | struct mm_struct *mm; | ||
1155 | struct mem_cgroup *mem, *old_mem; | ||
1156 | |||
1157 | mm = get_task_mm(p); | ||
1158 | if (mm == NULL) | ||
1159 | return; | ||
1160 | |||
1161 | mem = mem_cgroup_from_cont(cont); | ||
1162 | old_mem = mem_cgroup_from_cont(old_cont); | ||
1163 | |||
1164 | if (mem == old_mem) | ||
1165 | goto out; | ||
1166 | |||
1167 | /* | ||
1168 | * Only thread group leaders are allowed to migrate; the mm_struct is | ||
1169 | * in effect owned by the leader. | ||
1170 | */ | ||
1171 | if (p->tgid != p->pid) | ||
1172 | goto out; | ||
1173 | |||
1174 | css_get(&mem->css); | ||
1175 | rcu_assign_pointer(mm->mem_cgroup, mem); | ||
1176 | css_put(&old_mem->css); | ||
1177 | |||
1178 | out: | ||
1179 | mmput(mm); | ||
1180 | return; | ||
1181 | } | ||
1182 | |||
1183 | struct cgroup_subsys mem_cgroup_subsys = { | ||
1184 | .name = "memory", | ||
1185 | .subsys_id = mem_cgroup_subsys_id, | ||
1186 | .create = mem_cgroup_create, | ||
1187 | .pre_destroy = mem_cgroup_pre_destroy, | ||
1188 | .destroy = mem_cgroup_destroy, | ||
1189 | .populate = mem_cgroup_populate, | ||
1190 | .attach = mem_cgroup_move_task, | ||
1191 | .early_init = 0, | ||
1192 | }; | ||
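
For orientation, the subsystem registered above appears under the name "memory"; once the cgroup filesystem is mounted with it enabled, each cgroup directory carries the control files declared in mem_cgroup_files: usage_in_bytes and failcnt as read-only counters, limit_in_bytes writable through mem_cgroup_write() with the page-rounding strategy above, force_empty whose write triggers mem_cgroup_force_empty(), and stat backed by the seq_file output of mem_control_stat_show().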