author	Balbir Singh <balbir@linux.vnet.ibm.com>	2008-04-29 04:00:16 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-29 11:06:10 -0400
commit	cf475ad28ac35cc9ba612d67158f29b73b38b05d (patch)
tree	2c7cd568d00357bd42643ea602884e731cc24f26
parent	29486df325e1fe6e1764afcb19e3370804c2b002 (diff)
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead add an owner.

This approach was suggested by Paul Menage. The advantage of this approach is that, once mm->owner is known, the cgroup can be determined using the subsystem id. It also allows several control groups that are virtually grouped by mm_struct to exist independently of the memory controller, i.e., without adding a mem_cgroup-style pointer to mm_struct for each controller.

A new config option, CONFIG_MM_OWNER, is added, and the memory resource controller selects it. This patch also adds cgroup callbacks to notify subsystems when mm->owner changes. The mm_owner_changed callback is called with the task_lock() of the new task held, just prior to changing mm->owner.

I am indebted to Paul Menage for the several reviews of this patchset and for helping me make it lighter and simpler.

This patch was tested on a powerpc box; it was compiled with the MM_OWNER config both turned on and off.

After the thread group leader exits, it is moved to init_css_set by cgroup_exit(), so all future charges from running threads are redirected to the init_css_set's subsystem state.

Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
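To make the new lookup concrete: where code previously read mm->mem_cgroup directly, it now goes through the owner task. The following is an illustrative sketch, not part of the patch; mem_cgroup_from_task() and css_get()/css_put() are real helpers used by this patch, while mem_cgroup_of_mm_sketch() is an invented name.

/*
 * Illustrative only: resolve and pin the memory cgroup an mm is charged
 * to, via mm->owner. This mirrors the charge path in mm/memcontrol.c
 * below; the function name is hypothetical.
 */
static struct mem_cgroup *mem_cgroup_of_mm_sketch(struct mm_struct *mm)
{
	struct mem_cgroup *mem;

	rcu_read_lock();
	/* mm->owner is the thread group leader that owns this mm */
	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
	css_get(&mem->css);	/* pin the cgroup state before use */
	rcu_read_unlock();
	return mem;		/* caller releases with css_put(&mem->css) */
}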
-rw-r--r--	fs/exec.c	1
-rw-r--r--	include/linux/cgroup.h	15
-rw-r--r--	include/linux/memcontrol.h	16
-rw-r--r--	include/linux/mm_types.h	5
-rw-r--r--	include/linux/sched.h	13
-rw-r--r--	init/Kconfig	7
-rw-r--r--	init/main.c	1
-rw-r--r--	kernel/cgroup.c	30
-rw-r--r--	kernel/exit.c	83
-rw-r--r--	kernel/fork.c	11
-rw-r--r--	mm/memcontrol.c	28
11 files changed, 169 insertions(+), 41 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c
index 7768453dc986..711bc45d789c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -735,6 +735,7 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
+	mm_update_next_owner(mm);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 095248082b7e..e155aa78d859 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -305,6 +305,12 @@ struct cgroup_subsys {
 			  struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*bind)(struct cgroup_subsys *ss, struct cgroup *root);
+	/*
+	 * This routine is called with the task_lock of mm->owner held
+	 */
+	void (*mm_owner_changed)(struct cgroup_subsys *ss,
+				 struct cgroup *old,
+				 struct cgroup *new);
 	int subsys_id;
 	int active;
 	int disabled;
@@ -390,4 +396,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 
 #endif /* !CONFIG_CGROUPS */
 
+#ifdef CONFIG_MM_OWNER
+extern void
+cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new);
+#else /* !CONFIG_MM_OWNER */
+static inline void
+cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+}
+#endif /* CONFIG_MM_OWNER */
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8b1c4295848b..e6608776bc96 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -27,9 +27,6 @@ struct mm_struct;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 
-extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
-extern void mm_free_cgroup(struct mm_struct *mm);
-
 #define page_reset_bad_cgroup(page)	((page)->page_cgroup = 0)
 
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
@@ -48,8 +45,10 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
 
+extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
+
 #define mm_match_cgroup(mm, cgroup)	\
-	((cgroup) == rcu_dereference((mm)->mem_cgroup))
+	((cgroup) == mem_cgroup_from_task((mm)->owner))
 
 extern int mem_cgroup_prepare_migration(struct page *page);
 extern void mem_cgroup_end_migration(struct page *page);
@@ -73,15 +72,6 @@ extern long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
 				struct zone *zone, int priority);
 
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
-static inline void mm_init_cgroup(struct mm_struct *mm,
-					struct task_struct *p)
-{
-}
-
-static inline void mm_free_cgroup(struct mm_struct *mm)
-{
-}
-
 static inline void page_reset_bad_cgroup(struct page *page)
 {
 }
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e2bae8dde35a..bc97bd54f606 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -225,8 +225,9 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock; /* aio lock */
 	struct kioctx		*ioctx_list;
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
-	struct mem_cgroup *mem_cgroup;
+#ifdef CONFIG_MM_OWNER
+	struct task_struct *owner;	/* The thread group leader that */
+					/* owns the mm_struct.		*/
 #endif
 };
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 024d72b47a0c..1d02babdb2c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2148,6 +2148,19 @@ static inline void migration_init(void)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+#ifdef CONFIG_MM_OWNER
+extern void mm_update_next_owner(struct mm_struct *mm);
+extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
+#else
+static inline void mm_update_next_owner(struct mm_struct *mm)
+{
+}
+
+static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+}
+#endif /* CONFIG_MM_OWNER */
+
 #endif /* __KERNEL__ */
 
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index a3457926342a..98fa96eac415 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -378,9 +378,13 @@ config RESOURCE_COUNTERS
 	  infrastructure that works with cgroups
 	depends on CGROUPS
 
+config MM_OWNER
+	bool
+
 config CGROUP_MEM_RES_CTLR
 	bool "Memory Resource Controller for Control Groups"
 	depends on CGROUPS && RESOURCE_COUNTERS
+	select MM_OWNER
 	help
 	  Provides a memory resource controller that manages both page cache and
 	  RSS memory.
@@ -393,6 +397,9 @@ config CGROUP_MEM_RES_CTLR
 	  Only enable when you're ok with these trade offs and really
 	  sure you need the memory resource controller.
 
+	  This config option also selects the MM_OWNER config option, which
+	  could in turn add some fork/exit overhead.
+
 config SYSFS_DEPRECATED
 	bool
 
diff --git a/init/main.c b/init/main.c
index 1116d2f40cc1..c62c98f381f2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -559,6 +559,7 @@ asmlinkage void __init start_kernel(void)
 	printk(KERN_NOTICE);
 	printk(linux_banner);
 	setup_arch(&command_line);
+	mm_init_owner(&init_mm, &init_task);
 	setup_command_line(command_line);
 	unwind_setup();
 	setup_per_cpu_areas();
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index abc433772e5a..b9d467d83fc1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -119,6 +119,7 @@ static int root_count;
  * be called.
  */
 static int need_forkexit_callback;
+static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -2498,6 +2499,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 
 	need_forkexit_callback |= ss->fork || ss->exit;
+	need_mm_owner_callback |= !!ss->mm_owner_changed;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -2748,6 +2750,34 @@ void cgroup_fork_callbacks(struct task_struct *child)
 	}
 }
 
+#ifdef CONFIG_MM_OWNER
+/**
+ * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
+ * @old: the outgoing owner
+ * @new: the incoming owner
+ *
+ * Called on every change to mm->owner. mm_init_owner() does not
+ * invoke this routine, since it assigns the mm->owner the first time
+ * and does not change it.
+ */
+void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
+{
+	struct cgroup *oldcgrp, *newcgrp;
+
+	if (need_mm_owner_callback) {
+		int i;
+		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+			struct cgroup_subsys *ss = subsys[i];
+			oldcgrp = task_cgroup(old, ss->subsys_id);
+			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (oldcgrp == newcgrp)
+				continue;
+			if (ss->mm_owner_changed)
+				ss->mm_owner_changed(ss, oldcgrp, newcgrp);
+		}
+	}
+}
+#endif /* CONFIG_MM_OWNER */
+
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a9d98c641ac..ae0f2c4e452b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -557,6 +557,88 @@ void exit_fs(struct task_struct *tsk)
 
 EXPORT_SYMBOL_GPL(exit_fs);
 
+#ifdef CONFIG_MM_OWNER
+/*
+ * Task p is exiting and it owned mm, let's find a new owner for it
+ */
+static inline int
+mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	/*
+	 * If there are other users of the mm and the owner (us) is exiting
+	 * we need to find a new owner to take on the responsibility.
+	 */
+	if (!mm)
+		return 0;
+	if (atomic_read(&mm->mm_users) <= 1)
+		return 0;
+	if (mm->owner != p)
+		return 0;
+	return 1;
+}
+
+void mm_update_next_owner(struct mm_struct *mm)
+{
+	struct task_struct *c, *g, *p = current;
+
+retry:
+	if (!mm_need_new_owner(mm, p))
+		return;
+
+	read_lock(&tasklist_lock);
+	/*
+	 * Search in the children
+	 */
+	list_for_each_entry(c, &p->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search in the siblings
+	 */
+	list_for_each_entry(c, &p->parent->children, sibling) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	}
+
+	/*
+	 * Search through everything else. We should not get
+	 * here often
+	 */
+	do_each_thread(g, c) {
+		if (c->mm == mm)
+			goto assign_new_owner;
+	} while_each_thread(g, c);
+
+	read_unlock(&tasklist_lock);
+	return;
+
+assign_new_owner:
+	BUG_ON(c == p);
+	get_task_struct(c);
+	/*
+	 * The task_lock protects c->mm from changing.
+	 * We always want mm->owner->mm == mm
+	 */
+	task_lock(c);
+	/*
+	 * Delay read_unlock() till we have the task_lock()
+	 * to ensure that c does not slip away underneath us
+	 */
+	read_unlock(&tasklist_lock);
+	if (c->mm != mm) {
+		task_unlock(c);
+		put_task_struct(c);
+		goto retry;
+	}
+	cgroup_mm_owner_callbacks(mm->owner, c);
+	mm->owner = c;
+	task_unlock(c);
+	put_task_struct(c);
+}
+#endif /* CONFIG_MM_OWNER */
+
 /*
  * Turn us into a lazy TLB process if we
  * aren't already..
@@ -596,6 +678,7 @@ static void exit_mm(struct task_struct * tsk)
 	/* We don't want this task to be frozen prematurely */
 	clear_freeze_flag(tsk);
 	task_unlock(tsk);
+	mm_update_next_owner(mm);
 	mmput(mm);
 }
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 6067e429f281..156db96ff754 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -381,14 +381,13 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 	mm->ioctx_list = NULL;
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
-	mm_init_cgroup(mm, p);
+	mm_init_owner(mm, p);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
 		return mm;
 	}
 
-	mm_free_cgroup(mm);
 	free_mm(mm);
 	return NULL;
 }
@@ -438,7 +437,6 @@ void mmput(struct mm_struct *mm)
 			spin_unlock(&mmlist_lock);
 		}
 		put_swap_token(mm);
-		mm_free_cgroup(mm);
 		mmdrop(mm);
 	}
 }
@@ -982,6 +980,13 @@ static void rt_mutex_init_task(struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_MM_OWNER
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	mm->owner = p;
+}
+#endif /* CONFIG_MM_OWNER */
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d12795cc7622..49d80814798b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -236,26 +236,12 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 			css);
 }
 
-static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
 }
 
-void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
-{
-	struct mem_cgroup *mem;
-
-	mem = mem_cgroup_from_task(p);
-	css_get(&mem->css);
-	mm->mem_cgroup = mem;
-}
-
-void mm_free_cgroup(struct mm_struct *mm)
-{
-	css_put(&mm->mem_cgroup->css);
-}
-
 static inline int page_cgroup_locked(struct page *page)
 {
 	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
@@ -476,6 +462,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
 
+	BUG_ON(!mem_cont);
 	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
 	if (active)
 		src = &mz->active_list;
@@ -574,7 +561,7 @@ retry:
 		mm = &init_mm;
 
 	rcu_read_lock();
-	mem = rcu_dereference(mm->mem_cgroup);
+	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
 	/*
 	 * For every charge from the cgroup, increment reference count
 	 */
@@ -985,10 +972,9 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	struct mem_cgroup *mem;
 	int node;
 
-	if (unlikely((cont->parent) == NULL)) {
+	if (unlikely((cont->parent) == NULL))
 		mem = &init_mem_cgroup;
-		init_mm.mem_cgroup = mem;
-	} else
+	else
 		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
 
 	if (mem == NULL)
@@ -1067,10 +1053,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 	if (!thread_group_leader(p))
 		goto out;
 
-	css_get(&mem->css);
-	rcu_assign_pointer(mm->mem_cgroup, mem);
-	css_put(&old_mem->css);
-
 out:
 	mmput(mm);
 }
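As a usage note: any cgroup subsystem can now observe ownership changes by filling in the new mm_owner_changed hook. The sketch below is hypothetical; the struct fields and the callback signature come from this patch, but the "example" subsystem, its callback body, and example_subsys_id are invented for illustration.

/* Hypothetical subsystem reacting to mm->owner moving between cgroups. */
static void example_mm_owner_changed(struct cgroup_subsys *ss,
				     struct cgroup *old,
				     struct cgroup *new)
{
	/*
	 * Invoked with the task_lock() of the new owner held, and only
	 * when old != new; a subsystem could, e.g., migrate its per-mm
	 * accounting from the old cgroup to the new one here.
	 */
}

struct cgroup_subsys example_subsys = {
	.name			= "example",
	.subsys_id		= example_subsys_id,	/* assumes an entry in cgroup_subsys.h */
	.mm_owner_changed	= example_mm_owner_changed,
};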