| author    | Tejun Heo <tj@kernel.org> | 2015-09-16 12:53:17 -0400 |
|-----------|---------------------------|---------------------------|
| committer | Tejun Heo <tj@kernel.org> | 2015-09-16 12:53:17 -0400 |
| commit    | 1ed1328792ff46e4bb86a3d7f7be2971f4549f6c (patch) | |
| tree      | 53719cfc0bf81bc7e6fb522944553d9b4fa36cbf /kernel/cgroup.c | |
| parent    | 0c986253b939cc14c69d4adbe2b4121bdf4aa220 (diff) | |
sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem
Note: This commit was originally committed as d59cfc09c32a but got
reverted by 0c986253b939 due to the performance regression from the
percpu_rwsem write down/up operations added to the cgroup task
migration path. percpu_rwsem changes which alleviate the performance
issue are pending for the v4.4-rc1 merge window.

Re-apply.
The cgroup side of threadgroup locking uses signal_struct->group_rwsem
to synchronize against threadgroup changes. This per-process rwsem adds
a small overhead to the thread creation, exit and exec paths, forces
cgroup code paths into a lock-verify-unlock-retry dance in a couple of
places, and makes it impossible to perform operations atomically across
multiple processes.
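The lock-verify-unlock-retry dance mentioned above is the pattern visible at the retry_find_task label in the diff below. As a sketch, with attach_one() as a hypothetical stand-in for the lookup-and-attach step:

```c
/* Sketch of the pre-patch retry dance around the per-process rwsem.
 * attach_one() is hypothetical; the real caller re-finds the task by
 * pid before retrying (see retry_find_task in the diff below). */
static int attach_threadgroup(struct task_struct *tsk)
{
	int ret;

	threadgroup_lock(tsk);	/* down_write(&tsk->signal->group_rwsem) */
	if (!thread_group_leader(tsk)) {
		/* Raced with de_thread() from another thread's exec():
		 * leadership moved while we slept on the lock.  Unlock
		 * and let the caller re-find the leader and retry --
		 * the "double-double-toil-and-trouble-check locking"
		 * in the diff below. */
		threadgroup_unlock(tsk);
		return -EAGAIN;
	}
	ret = attach_one(tsk);	/* hypothetical migration step */
	threadgroup_unlock(tsk);
	return ret;
}
```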
This patch replaces signal_struct->group_rwsem with a global
percpu_rwsem, cgroup_threadgroup_rwsem, which is cheaper on the reader
side and contained in cgroups proper. The conversion is one-to-one. It
does make the writer side heavier and lowers the granularity; however,
cgroup process migration is a fairly cold path, we do want to optimize
thread operations over it, and cgroup migration operations don't take
long enough for the lower granularity to matter.
Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
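The diff below is limited to kernel/cgroup.c, so the new reader side of the semaphore is not visible in it. A minimal sketch of what the fork/exit/exec-side helpers reduce to after the conversion (their location and `static inline` form are assumptions; only the percpu-rwsem calls follow from the patch description):

```c
/* Sketch of the reader-side helpers after the conversion; their actual
 * definitions live outside kernel/cgroup.c and are assumed here. */
#include <linux/percpu-rwsem.h>

extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;

static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
	percpu_down_read(&cgroup_threadgroup_rwsem);	/* per-cpu fast path */
}

static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
	percpu_up_read(&cgroup_threadgroup_rwsem);
}
```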
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--  kernel/cgroup.c | 77
1 file changed, 20 insertions(+), 57 deletions(-)
```diff
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2c9eae6ad970..115091efa889 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
 			   !lockdep_is_held(&cgroup_mutex),		\
@@ -871,48 +874,6 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
-	down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
-	up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid.  This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization.  While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
-	down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-	up_write(&tsk->signal->group_rwsem);
-}
-
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2113,9 +2074,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	lockdep_assert_held(&css_set_rwsem);
 
 	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
+	 * We are synchronized through cgroup_threadgroup_rwsem against
+	 * PF_EXITING setting such that we can't race against cgroup_exit()
+	 * changing the css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
@@ -2172,10 +2133,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process.  Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process.  Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2278,7 +2240,7 @@ err:
 * @threadgroup: whether @leader points to the whole process or a single task
 *
 * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding threadgroup_lock of @leader.  The
+ * process, the caller must be holding cgroup_threadgroup_rwsem.  The
 * caller is also responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
@@ -2406,7 +2368,7 @@ out_release_tset:
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2528,7 +2490,7 @@ retry_find_task:
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	threadgroup_lock(tsk);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	if (threadgroup) {
 		if (!thread_group_leader(tsk)) {
 			/*
@@ -2538,7 +2500,7 @@ retry_find_task:
 			 * try again; this is
 			 * "double-double-toil-and-trouble-check locking".
 			 */
-			threadgroup_unlock(tsk);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
@@ -2548,7 +2510,7 @@ retry_find_task:
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	threadgroup_unlock(tsk);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 
 	put_task_struct(tsk);
 out_unlock_cgroup:
@@ -2751,17 +2713,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			goto out_finish;
 		last_task = task;
 
-		threadgroup_lock(task);
+		percpu_down_write(&cgroup_threadgroup_rwsem);
 		/* raced against de_thread() from another thread? */
 		if (!thread_group_leader(task)) {
-			threadgroup_unlock(task);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(task);
 			continue;
 		}
 
 		ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
-		threadgroup_unlock(task);
+		percpu_up_write(&cgroup_threadgroup_rwsem);
 		put_task_struct(task);
 
 		if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -5083,6 +5045,7 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int ssid, err;
 
+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
```
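Taken together, the scheme the patch converts to is plain percpu-rwsem usage: one-time initialization, cheap shared acquisition on the task-lifecycle paths, exclusive acquisition around migration. A condensed sketch (the demo_* names are hypothetical; the percpu-rwsem calls are the ones used above):

```c
/* Condensed usage sketch of the locking scheme above. */
#include <linux/percpu-rwsem.h>

static struct percpu_rw_semaphore demo_rwsem;

static int __init demo_init(void)
{
	/* One-time setup, mirroring cgroup_init(), which BUG_ONs on failure. */
	return percpu_init_rwsem(&demo_rwsem);
}

static void demo_thread_op(void)
{
	/* fork/exit/exec side: per-cpu fast path, no cross-CPU atomics. */
	percpu_down_read(&demo_rwsem);
	/* ... section during which the threadgroup must stay stable ... */
	percpu_up_read(&demo_rwsem);
}

static void demo_migrate(void)
{
	/* migration side: heavy, drains all readers before proceeding. */
	percpu_down_write(&demo_rwsem);
	/* ... move every thread of the process atomically ... */
	percpu_up_write(&demo_rwsem);
}
```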