author		Tejun Heo <tj@kernel.org>	2015-05-13 16:35:17 -0400
committer	Tejun Heo <tj@kernel.org>	2015-05-26 20:35:00 -0400
commit		d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14
tree		077533cef8f5e16c8f7fd65d7e255d75828f3820
parent		7d7efec368d537226142cbe559f45797f18672f9
sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem
The cgroup side of threadgroup locking uses signal_struct->group_rwsem
to synchronize against threadgroup changes.  This per-process rwsem adds
a small overhead to the thread creation, exit and exec paths, forces
cgroup code paths to do a lock-verify-unlock-retry dance in a couple of
places, and makes it impossible to atomically perform operations across
multiple processes.

This patch replaces signal_struct->group_rwsem with a global
percpu_rwsem, cgroup_threadgroup_rwsem, which is cheaper on the reader
side and contained in cgroups proper.  The conversion is one-to-one.

This does make the writer side heavier and lowers the granularity;
however, cgroup process migration is a fairly cold path, we do want to
optimize thread operations over it, and cgroup migration operations
don't take enough time for the lower granularity to matter.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
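The locking pattern being adopted is easy to model in userspace: hot
paths (thread creation, exit, exec) take one global rwsem for read, and
the cold migration path takes it for write.  A minimal pthread sketch of
that pattern follows -- a plain pthread_rwlock_t stands in for
percpu_rw_semaphore (whose point is precisely that its read side is much
cheaper than this), and the function names only mirror the kernel's:

	/* Userspace model of the patch's locking scheme.  One global
	 * rwlock plays the role of cgroup_threadgroup_rwsem: taken for
	 * read on hot paths, for write on the cold migration path. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t threadgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;

	static void threadgroup_change_begin(void)	/* fork/exit/exec */
	{
		pthread_rwlock_rdlock(&threadgroup_rwsem);
	}

	static void threadgroup_change_end(void)
	{
		pthread_rwlock_unlock(&threadgroup_rwsem);
	}

	static void migrate_process(void)		/* cgroup migration */
	{
		pthread_rwlock_wrlock(&threadgroup_rwsem);
		/* every threadgroup is stable here: no fork/exit runs */
		printf("migrating with all threadgroups quiesced\n");
		pthread_rwlock_unlock(&threadgroup_rwsem);
	}

	int main(void)
	{
		threadgroup_change_begin();
		printf("thread operation under the read side\n");
		threadgroup_change_end();
		migrate_process();
		return 0;
	}

Note the granularity change the message describes: before the patch a
writer excluded fork/exit in one process only; here, as in the patch,
it excludes them system-wide.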
-rw-r--r--	include/linux/cgroup-defs.h	27
-rw-r--r--	include/linux/init_task.h	8
-rw-r--r--	include/linux/sched.h		12
-rw-r--r--	init/Kconfig			1
-rw-r--r--	kernel/cgroup.c			77
-rw-r--r--	kernel/fork.c			4
6 files changed, 46 insertions(+), 83 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b8c93806dbd..7d83d7f73420 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -461,8 +461,31 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 };
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk);
-void cgroup_threadgroup_change_end(struct task_struct *tsk);
+extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+/**
+ * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_begin() and allows cgroup operations to
+ * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ */
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	percpu_down_read(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_end().  Counterpart of
+ * cgroup_threadgroup_change_begin().
+ */
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+	percpu_up_read(&cgroup_threadgroup_rwsem);
+}
 
 #else	/* CONFIG_CGROUPS */
 
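What makes the new percpu_down_read() cheap deserves a sketch.  Below is
a toy userspace model of the idea, under loud assumptions: per-thread
slots stand in for per-CPU counters, the writer spins where the kernel
sleeps, and the toy_* names are invented for illustration -- this is the
shape of a percpu rwsem, not the kernel's algorithm:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <pthread.h>
	#include <stdio.h>

	#define NSLOTS 8

	/* one counter per slot, padded so readers never share a line */
	struct slot {
		_Atomic int readers;
		char pad[64 - sizeof(_Atomic int)];
	};

	static struct slot slots[NSLOTS];
	static atomic_bool writer_pending;
	static pthread_mutex_t gate = PTHREAD_MUTEX_INITIALIZER;

	static int my_slot(void)	/* stand-in for smp_processor_id() */
	{
		return (int)((uintptr_t)pthread_self() % NSLOTS);
	}

	static void toy_down_read(void)
	{
		for (;;) {
			if (!atomic_load(&writer_pending)) {
				atomic_fetch_add(&slots[my_slot()].readers, 1);
				if (!atomic_load(&writer_pending))
					return;	/* fast path: own line only */
				atomic_fetch_sub(&slots[my_slot()].readers, 1);
			}
			/* slow path: block until the writer is done */
			pthread_mutex_lock(&gate);
			pthread_mutex_unlock(&gate);
		}
	}

	static void toy_up_read(void)
	{
		atomic_fetch_sub(&slots[my_slot()].readers, 1);
	}

	static void toy_down_write(void)
	{
		pthread_mutex_lock(&gate);		/* one writer    */
		atomic_store(&writer_pending, true);	/* gate readers  */
		for (int s = 0; s < NSLOTS; s++)	/* drain readers */
			while (atomic_load(&slots[s].readers) != 0)
				;	/* spin; the kernel sleeps here */
	}

	static void toy_up_write(void)
	{
		atomic_store(&writer_pending, false);
		pthread_mutex_unlock(&gate);
	}

	int main(void)
	{
		toy_down_read();
		printf("read side: one per-slot increment\n");
		toy_up_read();
		toy_down_write();
		printf("write side: gate set, %d slots drained\n", NSLOTS);
		toy_up_write();
		return 0;
	}

The asymmetry is the commit's trade-off in miniature: the read side
touches one private counter, while the write side walks every slot.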
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 696d22312b31..0cc0bbf20022 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 
-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig)						\
-	.group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
 #ifdef CONFIG_CPUSETS
 #define INIT_CPUSET_SEQ(tsk)						\
 	.mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -56,7 +49,6 @@ extern struct fs_struct init_fs;
 	},								\
 	.cred_guard_mutex =						\
 		__MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
-	INIT_GROUP_RWSEM(sig)						\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5ee290003470..add524a910bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -743,18 +743,6 @@ struct signal_struct {
 	unsigned audit_tty_log_passwd;
 	struct tty_audit_buf *tty_audit_buf;
 #endif
-#ifdef CONFIG_CGROUPS
-	/*
-	 * group_rwsem prevents new tasks from entering the threadgroup and
-	 * member tasks from exiting, more specifically, setting of
-	 * PF_EXITING.  fork and exit paths are protected with this rwsem
-	 * using threadgroup_change_begin/end().  Users which require
-	 * threadgroup to remain stable should use threadgroup_[un]lock()
-	 * which also takes care of exec path.  Currently, cgroup is the
-	 * only user.
-	 */
-	struct rw_semaphore group_rwsem;
-#endif
 
 	oom_flags_t oom_flags;
 	short oom_score_adj;		/* OOM kill score adjustment */
diff --git a/init/Kconfig b/init/Kconfig
index dc24dec60232..b9b824bf8f6b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -938,6 +938,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED
 menuconfig CGROUPS
 	bool "Control Group support"
 	select KERNFS
+	select PERCPU_RWSEM
 	help
 	  This option adds support for grouping sets of processes together, for
 	  use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 980b1f52f39f..77578a169b8c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	rcu_lockdep_assert(rcu_read_lock_held() ||			\
 			   lockdep_is_held(&cgroup_mutex),		\
@@ -848,48 +851,6 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
-	down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
-	up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid.  This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization.  While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
-	down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-	up_write(&tsk->signal->group_rwsem);
-}
-
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2095,9 +2056,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	lockdep_assert_held(&css_set_rwsem);
 
 	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
+	 * We are synchronized through cgroup_threadgroup_rwsem against
+	 * PF_EXITING setting such that we can't race against cgroup_exit()
+	 * changing the css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
@@ -2154,10 +2115,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process.  Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process.  Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2260,7 +2222,7 @@ err:
  * @threadgroup: whether @leader points to the whole process or a single task
 *
 * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding threadgroup_lock of @leader.  The
+ * process, the caller must be holding cgroup_threadgroup_rwsem.  The
 * caller is also responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
@@ -2388,7 +2350,7 @@ out_release_tset:
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2481,7 +2443,7 @@ retry_find_task:
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	threadgroup_lock(tsk);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	if (threadgroup) {
 		if (!thread_group_leader(tsk)) {
 			/*
@@ -2491,7 +2453,7 @@ retry_find_task:
 			 * try again; this is
 			 * "double-double-toil-and-trouble-check locking".
 			 */
-			threadgroup_unlock(tsk);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
@@ -2499,7 +2461,7 @@ retry_find_task:
 
 	ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	threadgroup_unlock(tsk);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 
 	put_task_struct(tsk);
 out_unlock_cgroup:
@@ -2704,17 +2666,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			goto out_finish;
 		last_task = task;
 
-		threadgroup_lock(task);
+		percpu_down_write(&cgroup_threadgroup_rwsem);
 		/* raced against de_thread() from another thread? */
 		if (!thread_group_leader(task)) {
-			threadgroup_unlock(task);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(task);
 			continue;
 		}
 
 		ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
-		threadgroup_unlock(task);
+		percpu_up_write(&cgroup_threadgroup_rwsem);
 		put_task_struct(task);
 
 		if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -5032,6 +4994,7 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int ssid, err;
 
+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
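The "double-double-toil-and-trouble-check" retry above survives the
conversion intact: take the write side first, re-verify the group leader
under it, and on a lost race against de_thread() drop the lock and redo
the lookup.  A compact userspace rendering of that control flow, with
invented stand-ins (struct task, find_task and attach_threadgroup are
illustrative, not kernel APIs):

	#include <pthread.h>
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	static pthread_rwlock_t threadgroup_rwsem = PTHREAD_RWLOCK_INITIALIZER;

	struct task { int pid; bool group_leader; };

	/* stand-in for the RCU-protected pid lookup */
	static struct task *find_task(int pid, struct task *tasks, int n)
	{
		for (int i = 0; i < n; i++)
			if (tasks[i].pid == pid)
				return &tasks[i];
		return NULL;
	}

	static int attach_threadgroup(int pid, struct task *tasks, int n)
	{
		struct task *tsk;

	retry_find_task:
		tsk = find_task(pid, tasks, n);
		if (!tsk)
			return -1;

		pthread_rwlock_wrlock(&threadgroup_rwsem);
		if (!tsk->group_leader) {
			/* raced against a de_thread()-style leader change;
			 * in the kernel the retry converges because the
			 * lookup then resolves to the new leader */
			pthread_rwlock_unlock(&threadgroup_rwsem);
			goto retry_find_task;
		}
		printf("attaching threadgroup led by pid %d\n", tsk->pid);
		pthread_rwlock_unlock(&threadgroup_rwsem);
		return 0;
	}

	int main(void)
	{
		struct task tasks[] = { { 1, true }, { 2, false } };
		return attach_threadgroup(1, tasks, 2);
	}

One writer-side caveat worth noting: with the rwsem now global, two
concurrent migrations serialize against each other, which the commit
message argues is acceptable for a cold path.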
diff --git a/kernel/fork.c b/kernel/fork.c
index 03c1eaaa6ef5..9531275e12a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1144,10 +1144,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
-#ifdef CONFIG_CGROUPS
-	init_rwsem(&sig->group_rwsem);
-#endif
-
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 