diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-02-10 14:36:19 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-02-10 14:36:19 -0500 |
commit | fb0dc5f129bc2d4763bdc237b8df0e1708c03e1e (patch) | |
tree | 4c635f1ca11535c0072d2f10282a31d16979d639 | |
parent | 9aece75c138d93bde79a2baeb9187a1109b4e952 (diff) | |
parent | 9a2ddda572a002633a64b1ae5f4bc49cfcbf495f (diff) |
Merge branch 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo:
- The destruction path of cgroup objects is asynchronous and
multi-staged, and some of them ended up destroying parents before
their children, leading to failures in the cpu and memory controllers.
Ensure that parents are always destroyed after their children.
- cpuset mm node migration was performed synchronously while holding
the threadgroup and cgroup mutexes, and the recent threadgroup locking
update resulted in a possible deadlock. The migration is best-effort
and shouldn't have been performed under those locks to begin with;
it has been made asynchronous.
- Minor documentation fix.
* 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
Documentation: cgroup: Fix 'cgroup-legacy' -> 'cgroup-v1'
cgroup: make sure a parent css isn't freed before its children
cgroup: make sure a parent css isn't offlined before its children
cpuset: make mm migration asynchronous
-rw-r--r-- | Documentation/cgroup-v2.txt | 2 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 6 | ||||
-rw-r--r-- | include/linux/cpuset.h | 6 | ||||
-rw-r--r-- | kernel/cgroup.c | 31 | ||||
-rw-r--r-- | kernel/cpuset.c | 71 |
5 files changed, 85 insertions, 31 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e8d25e784214..ff49cf901148 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt | |||
@@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and | |||
7 | conventions of cgroup v2. It describes all userland-visible aspects | 7 | conventions of cgroup v2. It describes all userland-visible aspects |
8 | of cgroup including core and specific controller behaviors. All | 8 | of cgroup including core and specific controller behaviors. All |
9 | future changes must be reflected in this document. Documentation for | 9 | future changes must be reflected in this document. Documentation for |
10 | v1 is available under Documentation/cgroup-legacy/. | 10 | v1 is available under Documentation/cgroup-v1/. |
11 | 11 | ||
12 | CONTENTS | 12 | CONTENTS |
13 | 13 | ||
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7f540f7f588d..789471dba6fb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
@@ -127,6 +127,12 @@ struct cgroup_subsys_state { | |||
127 | */ | 127 | */ |
128 | u64 serial_nr; | 128 | u64 serial_nr; |
129 | 129 | ||
130 | /* | ||
131 | * Incremented by online self and children. Used to guarantee that | ||
132 | * parents are not offlined before their children. | ||
133 | */ | ||
134 | atomic_t online_cnt; | ||
135 | |||
130 | /* percpu_ref killing and RCU release */ | 136 | /* percpu_ref killing and RCU release */ |
131 | struct rcu_head rcu_head; | 137 | struct rcu_head rcu_head; |
132 | struct work_struct destroy_work; | 138 | struct work_struct destroy_work; |
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..fea160ee5803 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
@@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) | |||
137 | task_unlock(current); | 137 | task_unlock(current); |
138 | } | 138 | } |
139 | 139 | ||
140 | extern void cpuset_post_attach_flush(void); | ||
141 | |||
140 | #else /* !CONFIG_CPUSETS */ | 142 | #else /* !CONFIG_CPUSETS */ |
141 | 143 | ||
142 | static inline bool cpusets_enabled(void) { return false; } | 144 | static inline bool cpusets_enabled(void) { return false; } |
@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) | |||
243 | return false; | 245 | return false; |
244 | } | 246 | } |
245 | 247 | ||
248 | static inline void cpuset_post_attach_flush(void) | ||
249 | { | ||
250 | } | ||
251 | |||
246 | #endif /* !CONFIG_CPUSETS */ | 252 | #endif /* !CONFIG_CPUSETS */ |
247 | 253 | ||
248 | #endif /* _LINUX_CPUSET_H */ | 254 | #endif /* _LINUX_CPUSET_H */ |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c03a640ef6da..d27904c193da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -58,6 +58,7 @@ | |||
58 | #include <linux/kthread.h> | 58 | #include <linux/kthread.h> |
59 | #include <linux/delay.h> | 59 | #include <linux/delay.h> |
60 | #include <linux/atomic.h> | 60 | #include <linux/atomic.h> |
61 | #include <linux/cpuset.h> | ||
61 | #include <net/sock.h> | 62 | #include <net/sock.h> |
62 | 63 | ||
63 | /* | 64 | /* |
@@ -2739,6 +2740,7 @@ out_unlock_rcu: | |||
2739 | out_unlock_threadgroup: | 2740 | out_unlock_threadgroup: |
2740 | percpu_up_write(&cgroup_threadgroup_rwsem); | 2741 | percpu_up_write(&cgroup_threadgroup_rwsem); |
2741 | cgroup_kn_unlock(of->kn); | 2742 | cgroup_kn_unlock(of->kn); |
2743 | cpuset_post_attach_flush(); | ||
2742 | return ret ?: nbytes; | 2744 | return ret ?: nbytes; |
2743 | } | 2745 | } |
2744 | 2746 | ||
@@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) | |||
4655 | 4657 | ||
4656 | if (ss) { | 4658 | if (ss) { |
4657 | /* css free path */ | 4659 | /* css free path */ |
4660 | struct cgroup_subsys_state *parent = css->parent; | ||
4658 | int id = css->id; | 4661 | int id = css->id; |
4659 | 4662 | ||
4660 | if (css->parent) | ||
4661 | css_put(css->parent); | ||
4662 | |||
4663 | ss->css_free(css); | 4663 | ss->css_free(css); |
4664 | cgroup_idr_remove(&ss->css_idr, id); | 4664 | cgroup_idr_remove(&ss->css_idr, id); |
4665 | cgroup_put(cgrp); | 4665 | cgroup_put(cgrp); |
4666 | |||
4667 | if (parent) | ||
4668 | css_put(parent); | ||
4666 | } else { | 4669 | } else { |
4667 | /* cgroup free path */ | 4670 | /* cgroup free path */ |
4668 | atomic_dec(&cgrp->root->nr_cgrps); | 4671 | atomic_dec(&cgrp->root->nr_cgrps); |
@@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
4758 | INIT_LIST_HEAD(&css->sibling); | 4761 | INIT_LIST_HEAD(&css->sibling); |
4759 | INIT_LIST_HEAD(&css->children); | 4762 | INIT_LIST_HEAD(&css->children); |
4760 | css->serial_nr = css_serial_nr_next++; | 4763 | css->serial_nr = css_serial_nr_next++; |
4764 | atomic_set(&css->online_cnt, 0); | ||
4761 | 4765 | ||
4762 | if (cgroup_parent(cgrp)) { | 4766 | if (cgroup_parent(cgrp)) { |
4763 | css->parent = cgroup_css(cgroup_parent(cgrp), ss); | 4767 | css->parent = cgroup_css(cgroup_parent(cgrp), ss); |
@@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css) | |||
4780 | if (!ret) { | 4784 | if (!ret) { |
4781 | css->flags |= CSS_ONLINE; | 4785 | css->flags |= CSS_ONLINE; |
4782 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); | 4786 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); |
4787 | |||
4788 | atomic_inc(&css->online_cnt); | ||
4789 | if (css->parent) | ||
4790 | atomic_inc(&css->parent->online_cnt); | ||
4783 | } | 4791 | } |
4784 | return ret; | 4792 | return ret; |
4785 | } | 4793 | } |
@@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work) | |||
5017 | container_of(work, struct cgroup_subsys_state, destroy_work); | 5025 | container_of(work, struct cgroup_subsys_state, destroy_work); |
5018 | 5026 | ||
5019 | mutex_lock(&cgroup_mutex); | 5027 | mutex_lock(&cgroup_mutex); |
5020 | offline_css(css); | ||
5021 | mutex_unlock(&cgroup_mutex); | ||
5022 | 5028 | ||
5023 | css_put(css); | 5029 | do { |
5030 | offline_css(css); | ||
5031 | css_put(css); | ||
5032 | /* @css can't go away while we're holding cgroup_mutex */ | ||
5033 | css = css->parent; | ||
5034 | } while (css && atomic_dec_and_test(&css->online_cnt)); | ||
5035 | |||
5036 | mutex_unlock(&cgroup_mutex); | ||
5024 | } | 5037 | } |
5025 | 5038 | ||
5026 | /* css kill confirmation processing requires process context, bounce */ | 5039 | /* css kill confirmation processing requires process context, bounce */ |
@@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | |||
5029 | struct cgroup_subsys_state *css = | 5042 | struct cgroup_subsys_state *css = |
5030 | container_of(ref, struct cgroup_subsys_state, refcnt); | 5043 | container_of(ref, struct cgroup_subsys_state, refcnt); |
5031 | 5044 | ||
5032 | INIT_WORK(&css->destroy_work, css_killed_work_fn); | 5045 | if (atomic_dec_and_test(&css->online_cnt)) { |
5033 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 5046 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
5047 | queue_work(cgroup_destroy_wq, &css->destroy_work); | ||
5048 | } | ||
5034 | } | 5049 | } |
5035 | 5050 | ||
5036 | /** | 5051 | /** |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e945fcd8179..41989ab4db57 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = { | |||
287 | static DEFINE_MUTEX(cpuset_mutex); | 287 | static DEFINE_MUTEX(cpuset_mutex); |
288 | static DEFINE_SPINLOCK(callback_lock); | 288 | static DEFINE_SPINLOCK(callback_lock); |
289 | 289 | ||
290 | static struct workqueue_struct *cpuset_migrate_mm_wq; | ||
291 | |||
290 | /* | 292 | /* |
291 | * CPU / memory hotplug is handled asynchronously. | 293 | * CPU / memory hotplug is handled asynchronously. |
292 | */ | 294 | */ |
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
972 | } | 974 | } |
973 | 975 | ||
974 | /* | 976 | /* |
975 | * cpuset_migrate_mm | 977 | * Migrate memory region from one set of nodes to another. This is |
976 | * | 978 | * performed asynchronously as it can be called from process migration path |
977 | * Migrate memory region from one set of nodes to another. | 979 | * holding locks involved in process management. All mm migrations are |
978 | * | 980 | * performed in the queued order and can be waited for by flushing |
979 | * Temporarilly set tasks mems_allowed to target nodes of migration, | 981 | * cpuset_migrate_mm_wq. |
980 | * so that the migration code can allocate pages on these nodes. | ||
981 | * | ||
982 | * While the mm_struct we are migrating is typically from some | ||
983 | * other task, the task_struct mems_allowed that we are hacking | ||
984 | * is for our current task, which must allocate new pages for that | ||
985 | * migrating memory region. | ||
986 | */ | 982 | */ |
987 | 983 | ||
984 | struct cpuset_migrate_mm_work { | ||
985 | struct work_struct work; | ||
986 | struct mm_struct *mm; | ||
987 | nodemask_t from; | ||
988 | nodemask_t to; | ||
989 | }; | ||
990 | |||
991 | static void cpuset_migrate_mm_workfn(struct work_struct *work) | ||
992 | { | ||
993 | struct cpuset_migrate_mm_work *mwork = | ||
994 | container_of(work, struct cpuset_migrate_mm_work, work); | ||
995 | |||
996 | /* on a wq worker, no need to worry about %current's mems_allowed */ | ||
997 | do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); | ||
998 | mmput(mwork->mm); | ||
999 | kfree(mwork); | ||
1000 | } | ||
1001 | |||
988 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | 1002 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, |
989 | const nodemask_t *to) | 1003 | const nodemask_t *to) |
990 | { | 1004 | { |
991 | struct task_struct *tsk = current; | 1005 | struct cpuset_migrate_mm_work *mwork; |
992 | |||
993 | tsk->mems_allowed = *to; | ||
994 | 1006 | ||
995 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 1007 | mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); |
1008 | if (mwork) { | ||
1009 | mwork->mm = mm; | ||
1010 | mwork->from = *from; | ||
1011 | mwork->to = *to; | ||
1012 | INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); | ||
1013 | queue_work(cpuset_migrate_mm_wq, &mwork->work); | ||
1014 | } else { | ||
1015 | mmput(mm); | ||
1016 | } | ||
1017 | } | ||
996 | 1018 | ||
997 | rcu_read_lock(); | 1019 | void cpuset_post_attach_flush(void) |
998 | guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); | 1020 | { |
999 | rcu_read_unlock(); | 1021 | flush_workqueue(cpuset_migrate_mm_wq); |
1000 | } | 1022 | } |
1001 | 1023 | ||
1002 | /* | 1024 | /* |
@@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) | |||
1097 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1119 | mpol_rebind_mm(mm, &cs->mems_allowed); |
1098 | if (migrate) | 1120 | if (migrate) |
1099 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); | 1121 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); |
1100 | mmput(mm); | 1122 | else |
1123 | mmput(mm); | ||
1101 | } | 1124 | } |
1102 | css_task_iter_end(&it); | 1125 | css_task_iter_end(&it); |
1103 | 1126 | ||
@@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) | |||
1545 | * @old_mems_allowed is the right nodesets that we | 1568 | * @old_mems_allowed is the right nodesets that we |
1546 | * migrate mm from. | 1569 | * migrate mm from. |
1547 | */ | 1570 | */ |
1548 | if (is_memory_migrate(cs)) { | 1571 | if (is_memory_migrate(cs)) |
1549 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, | 1572 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, |
1550 | &cpuset_attach_nodemask_to); | 1573 | &cpuset_attach_nodemask_to); |
1551 | } | 1574 | else |
1552 | mmput(mm); | 1575 | mmput(mm); |
1553 | } | 1576 | } |
1554 | } | 1577 | } |
1555 | 1578 | ||
@@ -1714,6 +1737,7 @@ out_unlock: | |||
1714 | mutex_unlock(&cpuset_mutex); | 1737 | mutex_unlock(&cpuset_mutex); |
1715 | kernfs_unbreak_active_protection(of->kn); | 1738 | kernfs_unbreak_active_protection(of->kn); |
1716 | css_put(&cs->css); | 1739 | css_put(&cs->css); |
1740 | flush_workqueue(cpuset_migrate_mm_wq); | ||
1717 | return retval ?: nbytes; | 1741 | return retval ?: nbytes; |
1718 | } | 1742 | } |
1719 | 1743 | ||
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void) | |||
2359 | top_cpuset.effective_mems = node_states[N_MEMORY]; | 2383 | top_cpuset.effective_mems = node_states[N_MEMORY]; |
2360 | 2384 | ||
2361 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); | 2385 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); |
2386 | |||
2387 | cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); | ||
2388 | BUG_ON(!cpuset_migrate_mm_wq); | ||
2362 | } | 2389 | } |
2363 | 2390 | ||
2364 | /** | 2391 | /** |