diff options
| author | Tejun Heo <tj@kernel.org> | 2015-05-13 16:35:17 -0400 |
|---|---|---|
| committer | Tejun Heo <tj@kernel.org> | 2015-05-26 20:35:00 -0400 |
| commit | d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14 (patch) | |
| tree | 077533cef8f5e16c8f7fd65d7e255d75828f3820 | |
| parent | 7d7efec368d537226142cbe559f45797f18672f9 (diff) | |
sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem
The cgroup side of threadgroup locking uses signal_struct->group_rwsem
to synchronize against threadgroup changes. This per-process rwsem
adds a small overhead to the thread creation, exit and exec paths,
forces cgroup code paths to do a lock-verify-unlock-retry dance in a
couple of places, and makes it impossible to atomically perform
operations across multiple processes.
This patch replaces signal_struct->group_rwsem with a global
percpu_rwsem cgroup_threadgroup_rwsem which is cheaper on the reader
side and contained in cgroups proper. This patch converts one-to-one.
This does make the writer side heavier and lowers the granularity;
however, cgroup process migration is a fairly cold path, we do want to
optimize thread operations over it, and cgroup migration operations
don't take enough time for the lower granularity to matter.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
| -rw-r--r-- | include/linux/cgroup-defs.h | 27 | ||||
| -rw-r--r-- | include/linux/init_task.h | 8 | ||||
| -rw-r--r-- | include/linux/sched.h | 12 | ||||
| -rw-r--r-- | init/Kconfig | 1 | ||||
| -rw-r--r-- | kernel/cgroup.c | 77 | ||||
| -rw-r--r-- | kernel/fork.c | 4 |
6 files changed, 46 insertions, 83 deletions
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b8c93806dbd..7d83d7f73420 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
| @@ -461,8 +461,31 @@ struct cgroup_subsys { | |||
| 461 | unsigned int depends_on; | 461 | unsigned int depends_on; |
| 462 | }; | 462 | }; |
| 463 | 463 | ||
| 464 | void cgroup_threadgroup_change_begin(struct task_struct *tsk); | 464 | extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; |
| 465 | void cgroup_threadgroup_change_end(struct task_struct *tsk); | 465 | |
| 466 | /** | ||
| 467 | * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups | ||
| 468 | * @tsk: target task | ||
| 469 | * | ||
| 470 | * Called from threadgroup_change_begin() and allows cgroup operations to | ||
| 471 | * synchronize against threadgroup changes using a percpu_rw_semaphore. | ||
| 472 | */ | ||
| 473 | static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) | ||
| 474 | { | ||
| 475 | percpu_down_read(&cgroup_threadgroup_rwsem); | ||
| 476 | } | ||
| 477 | |||
| 478 | /** | ||
| 479 | * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups | ||
| 480 | * @tsk: target task | ||
| 481 | * | ||
| 482 | * Called from threadgroup_change_end(). Counterpart of | ||
| 483 | * cgroup_threadgroup_change_begin(). | ||
| 484 | */ | ||
| 485 | static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) | ||
| 486 | { | ||
| 487 | percpu_up_read(&cgroup_threadgroup_rwsem); | ||
| 488 | } | ||
| 466 | 489 | ||
| 467 | #else /* CONFIG_CGROUPS */ | 490 | #else /* CONFIG_CGROUPS */ |
| 468 | 491 | ||
diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 696d22312b31..0cc0bbf20022 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h | |||
| @@ -25,13 +25,6 @@ | |||
| 25 | extern struct files_struct init_files; | 25 | extern struct files_struct init_files; |
| 26 | extern struct fs_struct init_fs; | 26 | extern struct fs_struct init_fs; |
| 27 | 27 | ||
| 28 | #ifdef CONFIG_CGROUPS | ||
| 29 | #define INIT_GROUP_RWSEM(sig) \ | ||
| 30 | .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), | ||
| 31 | #else | ||
| 32 | #define INIT_GROUP_RWSEM(sig) | ||
| 33 | #endif | ||
| 34 | |||
| 35 | #ifdef CONFIG_CPUSETS | 28 | #ifdef CONFIG_CPUSETS |
| 36 | #define INIT_CPUSET_SEQ(tsk) \ | 29 | #define INIT_CPUSET_SEQ(tsk) \ |
| 37 | .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), | 30 | .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq), |
| @@ -56,7 +49,6 @@ extern struct fs_struct init_fs; | |||
| 56 | }, \ | 49 | }, \ |
| 57 | .cred_guard_mutex = \ | 50 | .cred_guard_mutex = \ |
| 58 | __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ | 51 | __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ |
| 59 | INIT_GROUP_RWSEM(sig) \ | ||
| 60 | } | 52 | } |
| 61 | 53 | ||
| 62 | extern struct nsproxy init_nsproxy; | 54 | extern struct nsproxy init_nsproxy; |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 5ee290003470..add524a910bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -743,18 +743,6 @@ struct signal_struct { | |||
| 743 | unsigned audit_tty_log_passwd; | 743 | unsigned audit_tty_log_passwd; |
| 744 | struct tty_audit_buf *tty_audit_buf; | 744 | struct tty_audit_buf *tty_audit_buf; |
| 745 | #endif | 745 | #endif |
| 746 | #ifdef CONFIG_CGROUPS | ||
| 747 | /* | ||
| 748 | * group_rwsem prevents new tasks from entering the threadgroup and | ||
| 749 | * member tasks from exiting, more specifically, setting of | ||
| 750 | * PF_EXITING. fork and exit paths are protected with this rwsem | ||
| 751 | * using threadgroup_change_begin/end(). Users which require | ||
| 752 | * threadgroup to remain stable should use threadgroup_[un]lock() | ||
| 753 | * which also takes care of exec path. Currently, cgroup is the | ||
| 754 | * only user. | ||
| 755 | */ | ||
| 756 | struct rw_semaphore group_rwsem; | ||
| 757 | #endif | ||
| 758 | 746 | ||
| 759 | oom_flags_t oom_flags; | 747 | oom_flags_t oom_flags; |
| 760 | short oom_score_adj; /* OOM kill score adjustment */ | 748 | short oom_score_adj; /* OOM kill score adjustment */ |
diff --git a/init/Kconfig b/init/Kconfig index dc24dec60232..b9b824bf8f6b 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
| @@ -938,6 +938,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED | |||
| 938 | menuconfig CGROUPS | 938 | menuconfig CGROUPS |
| 939 | bool "Control Group support" | 939 | bool "Control Group support" |
| 940 | select KERNFS | 940 | select KERNFS |
| 941 | select PERCPU_RWSEM | ||
| 941 | help | 942 | help |
| 942 | This option adds support for grouping sets of processes together, for | 943 | This option adds support for grouping sets of processes together, for |
| 943 | use with process control subsystems such as Cpusets, CFS, memory | 944 | use with process control subsystems such as Cpusets, CFS, memory |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 980b1f52f39f..77578a169b8c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
| 47 | #include <linux/spinlock.h> | 47 | #include <linux/spinlock.h> |
| 48 | #include <linux/rwsem.h> | 48 | #include <linux/rwsem.h> |
| 49 | #include <linux/percpu-rwsem.h> | ||
| 49 | #include <linux/string.h> | 50 | #include <linux/string.h> |
| 50 | #include <linux/sort.h> | 51 | #include <linux/sort.h> |
| 51 | #include <linux/kmod.h> | 52 | #include <linux/kmod.h> |
| @@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); | |||
| 103 | */ | 104 | */ |
| 104 | static DEFINE_SPINLOCK(release_agent_path_lock); | 105 | static DEFINE_SPINLOCK(release_agent_path_lock); |
| 105 | 106 | ||
| 107 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | ||
| 108 | |||
| 106 | #define cgroup_assert_mutex_or_rcu_locked() \ | 109 | #define cgroup_assert_mutex_or_rcu_locked() \ |
| 107 | rcu_lockdep_assert(rcu_read_lock_held() || \ | 110 | rcu_lockdep_assert(rcu_read_lock_held() || \ |
| 108 | lockdep_is_held(&cgroup_mutex), \ | 111 | lockdep_is_held(&cgroup_mutex), \ |
| @@ -848,48 +851,6 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
| 848 | return cset; | 851 | return cset; |
| 849 | } | 852 | } |
| 850 | 853 | ||
| 851 | void cgroup_threadgroup_change_begin(struct task_struct *tsk) | ||
| 852 | { | ||
| 853 | down_read(&tsk->signal->group_rwsem); | ||
| 854 | } | ||
| 855 | |||
| 856 | void cgroup_threadgroup_change_end(struct task_struct *tsk) | ||
| 857 | { | ||
| 858 | up_read(&tsk->signal->group_rwsem); | ||
| 859 | } | ||
| 860 | |||
| 861 | /** | ||
| 862 | * threadgroup_lock - lock threadgroup | ||
| 863 | * @tsk: member task of the threadgroup to lock | ||
| 864 | * | ||
| 865 | * Lock the threadgroup @tsk belongs to. No new task is allowed to enter | ||
| 866 | * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or | ||
| 867 | * change ->group_leader/pid. This is useful for cases where the threadgroup | ||
| 868 | * needs to stay stable across blockable operations. | ||
| 869 | * | ||
| 870 | * fork and exit explicitly call threadgroup_change_{begin|end}() for | ||
| 871 | * synchronization. While held, no new task will be added to threadgroup | ||
| 872 | * and no existing live task will have its PF_EXITING set. | ||
| 873 | * | ||
| 874 | * de_thread() does threadgroup_change_{begin|end}() when a non-leader | ||
| 875 | * sub-thread becomes a new leader. | ||
| 876 | */ | ||
| 877 | static void threadgroup_lock(struct task_struct *tsk) | ||
| 878 | { | ||
| 879 | down_write(&tsk->signal->group_rwsem); | ||
| 880 | } | ||
| 881 | |||
| 882 | /** | ||
| 883 | * threadgroup_unlock - unlock threadgroup | ||
| 884 | * @tsk: member task of the threadgroup to unlock | ||
| 885 | * | ||
| 886 | * Reverse threadgroup_lock(). | ||
| 887 | */ | ||
| 888 | static inline void threadgroup_unlock(struct task_struct *tsk) | ||
| 889 | { | ||
| 890 | up_write(&tsk->signal->group_rwsem); | ||
| 891 | } | ||
| 892 | |||
| 893 | static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) | 854 | static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) |
| 894 | { | 855 | { |
| 895 | struct cgroup *root_cgrp = kf_root->kn->priv; | 856 | struct cgroup *root_cgrp = kf_root->kn->priv; |
| @@ -2095,9 +2056,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, | |||
| 2095 | lockdep_assert_held(&css_set_rwsem); | 2056 | lockdep_assert_held(&css_set_rwsem); |
| 2096 | 2057 | ||
| 2097 | /* | 2058 | /* |
| 2098 | * We are synchronized through threadgroup_lock() against PF_EXITING | 2059 | * We are synchronized through cgroup_threadgroup_rwsem against |
| 2099 | * setting such that we can't race against cgroup_exit() changing the | 2060 | * PF_EXITING setting such that we can't race against cgroup_exit() |
| 2100 | * css_set to init_css_set and dropping the old one. | 2061 | * changing the css_set to init_css_set and dropping the old one. |
| 2101 | */ | 2062 | */ |
| 2102 | WARN_ON_ONCE(tsk->flags & PF_EXITING); | 2063 | WARN_ON_ONCE(tsk->flags & PF_EXITING); |
| 2103 | old_cset = task_css_set(tsk); | 2064 | old_cset = task_css_set(tsk); |
| @@ -2154,10 +2115,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) | |||
| 2154 | * @src_cset and add it to @preloaded_csets, which should later be cleaned | 2115 | * @src_cset and add it to @preloaded_csets, which should later be cleaned |
| 2155 | * up by cgroup_migrate_finish(). | 2116 | * up by cgroup_migrate_finish(). |
| 2156 | * | 2117 | * |
| 2157 | * This function may be called without holding threadgroup_lock even if the | 2118 | * This function may be called without holding cgroup_threadgroup_rwsem |
| 2158 | * target is a process. Threads may be created and destroyed but as long | 2119 | * even if the target is a process. Threads may be created and destroyed |
| 2159 | * as cgroup_mutex is not dropped, no new css_set can be put into play and | 2120 | * but as long as cgroup_mutex is not dropped, no new css_set can be put |
| 2160 | * the preloaded css_sets are guaranteed to cover all migrations. | 2121 | * into play and the preloaded css_sets are guaranteed to cover all |
| 2122 | * migrations. | ||
| 2161 | */ | 2123 | */ |
| 2162 | static void cgroup_migrate_add_src(struct css_set *src_cset, | 2124 | static void cgroup_migrate_add_src(struct css_set *src_cset, |
| 2163 | struct cgroup *dst_cgrp, | 2125 | struct cgroup *dst_cgrp, |
| @@ -2260,7 +2222,7 @@ err: | |||
| 2260 | * @threadgroup: whether @leader points to the whole process or a single task | 2222 | * @threadgroup: whether @leader points to the whole process or a single task |
| 2261 | * | 2223 | * |
| 2262 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a | 2224 | * Migrate a process or task denoted by @leader to @cgrp. If migrating a |
| 2263 | * process, the caller must be holding threadgroup_lock of @leader. The | 2225 | * process, the caller must be holding cgroup_threadgroup_rwsem. The |
| 2264 | * caller is also responsible for invoking cgroup_migrate_add_src() and | 2226 | * caller is also responsible for invoking cgroup_migrate_add_src() and |
| 2265 | * cgroup_migrate_prepare_dst() on the targets before invoking this | 2227 | * cgroup_migrate_prepare_dst() on the targets before invoking this |
| 2266 | * function and following up with cgroup_migrate_finish(). | 2228 | * function and following up with cgroup_migrate_finish(). |
| @@ -2388,7 +2350,7 @@ out_release_tset: | |||
| 2388 | * @leader: the task or the leader of the threadgroup to be attached | 2350 | * @leader: the task or the leader of the threadgroup to be attached |
| 2389 | * @threadgroup: attach the whole threadgroup? | 2351 | * @threadgroup: attach the whole threadgroup? |
| 2390 | * | 2352 | * |
| 2391 | * Call holding cgroup_mutex and threadgroup_lock of @leader. | 2353 | * Call holding cgroup_mutex and cgroup_threadgroup_rwsem. |
| 2392 | */ | 2354 | */ |
| 2393 | static int cgroup_attach_task(struct cgroup *dst_cgrp, | 2355 | static int cgroup_attach_task(struct cgroup *dst_cgrp, |
| 2394 | struct task_struct *leader, bool threadgroup) | 2356 | struct task_struct *leader, bool threadgroup) |
| @@ -2481,7 +2443,7 @@ retry_find_task: | |||
| 2481 | get_task_struct(tsk); | 2443 | get_task_struct(tsk); |
| 2482 | rcu_read_unlock(); | 2444 | rcu_read_unlock(); |
| 2483 | 2445 | ||
| 2484 | threadgroup_lock(tsk); | 2446 | percpu_down_write(&cgroup_threadgroup_rwsem); |
| 2485 | if (threadgroup) { | 2447 | if (threadgroup) { |
| 2486 | if (!thread_group_leader(tsk)) { | 2448 | if (!thread_group_leader(tsk)) { |
| 2487 | /* | 2449 | /* |
| @@ -2491,7 +2453,7 @@ retry_find_task: | |||
| 2491 | * try again; this is | 2453 | * try again; this is |
| 2492 | * "double-double-toil-and-trouble-check locking". | 2454 | * "double-double-toil-and-trouble-check locking". |
| 2493 | */ | 2455 | */ |
| 2494 | threadgroup_unlock(tsk); | 2456 | percpu_up_write(&cgroup_threadgroup_rwsem); |
| 2495 | put_task_struct(tsk); | 2457 | put_task_struct(tsk); |
| 2496 | goto retry_find_task; | 2458 | goto retry_find_task; |
| 2497 | } | 2459 | } |
| @@ -2499,7 +2461,7 @@ retry_find_task: | |||
| 2499 | 2461 | ||
| 2500 | ret = cgroup_attach_task(cgrp, tsk, threadgroup); | 2462 | ret = cgroup_attach_task(cgrp, tsk, threadgroup); |
| 2501 | 2463 | ||
| 2502 | threadgroup_unlock(tsk); | 2464 | percpu_up_write(&cgroup_threadgroup_rwsem); |
| 2503 | 2465 | ||
| 2504 | put_task_struct(tsk); | 2466 | put_task_struct(tsk); |
| 2505 | out_unlock_cgroup: | 2467 | out_unlock_cgroup: |
| @@ -2704,17 +2666,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) | |||
| 2704 | goto out_finish; | 2666 | goto out_finish; |
| 2705 | last_task = task; | 2667 | last_task = task; |
| 2706 | 2668 | ||
| 2707 | threadgroup_lock(task); | 2669 | percpu_down_write(&cgroup_threadgroup_rwsem); |
| 2708 | /* raced against de_thread() from another thread? */ | 2670 | /* raced against de_thread() from another thread? */ |
| 2709 | if (!thread_group_leader(task)) { | 2671 | if (!thread_group_leader(task)) { |
| 2710 | threadgroup_unlock(task); | 2672 | percpu_up_write(&cgroup_threadgroup_rwsem); |
| 2711 | put_task_struct(task); | 2673 | put_task_struct(task); |
| 2712 | continue; | 2674 | continue; |
| 2713 | } | 2675 | } |
| 2714 | 2676 | ||
| 2715 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); | 2677 | ret = cgroup_migrate(src_cset->dfl_cgrp, task, true); |
| 2716 | 2678 | ||
| 2717 | threadgroup_unlock(task); | 2679 | percpu_up_write(&cgroup_threadgroup_rwsem); |
| 2718 | put_task_struct(task); | 2680 | put_task_struct(task); |
| 2719 | 2681 | ||
| 2720 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) | 2682 | if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret)) |
| @@ -5032,6 +4994,7 @@ int __init cgroup_init(void) | |||
| 5032 | unsigned long key; | 4994 | unsigned long key; |
| 5033 | int ssid, err; | 4995 | int ssid, err; |
| 5034 | 4996 | ||
| 4997 | BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); | ||
| 5035 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); | 4998 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); |
| 5036 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); | 4999 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); |
| 5037 | 5000 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index 03c1eaaa6ef5..9531275e12a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1144,10 +1144,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1144 | tty_audit_fork(sig); | 1144 | tty_audit_fork(sig); |
| 1145 | sched_autogroup_fork(sig); | 1145 | sched_autogroup_fork(sig); |
| 1146 | 1146 | ||
| 1147 | #ifdef CONFIG_CGROUPS | ||
| 1148 | init_rwsem(&sig->group_rwsem); | ||
| 1149 | #endif | ||
| 1150 | |||
| 1151 | sig->oom_score_adj = current->signal->oom_score_adj; | 1147 | sig->oom_score_adj = current->signal->oom_score_adj; |
| 1152 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 1148 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
| 1153 | 1149 | ||
