author     Linus Torvalds <torvalds@linux-foundation.org>  2016-02-10 14:36:19 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-02-10 14:36:19 -0500
commit     fb0dc5f129bc2d4763bdc237b8df0e1708c03e1e (patch)
tree       4c635f1ca11535c0072d2f10282a31d16979d639
parent     9aece75c138d93bde79a2baeb9187a1109b4e952 (diff)
parent     9a2ddda572a002633a64b1ae5f4bc49cfcbf495f (diff)
Merge branch 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup fixes from Tejun Heo:

 - The destruction path of cgroup objects is asynchronous and
   multi-staged, and some of them ended up destroying parents before
   children, leading to failures in the cpu and memory controllers.
   Ensure that parents are always destroyed after children.

 - cpuset mm node migration was performed synchronously while holding
   threadgroup and cgroup mutexes, and the recent threadgroup locking
   update resulted in a possible deadlock.  The migration is best
   effort and shouldn't have been performed under those locks to begin
   with.  Made asynchronous.

 - Minor documentation fix.

* 'for-4.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  Documentation: cgroup: Fix 'cgroup-legacy' -> 'cgroup-v1'
  cgroup: make sure a parent css isn't freed before its children
  cgroup: make sure a parent css isn't offlined before its children
  cpuset: make mm migration asynchronous
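The second and third fixes both rest on the new online_cnt counter: each css counts itself plus each of its online children, and offlining only propagates to the parent once the decrement that drains the count arrives. Below is a minimal userspace sketch of that counting scheme using C11 atomics; all names (node_online, node_kill, offline_walk) are hypothetical and this is an illustration of the pattern, not kernel code.

/* Userspace sketch of the online_cnt scheme; hypothetical names. */
#include <stdatomic.h>
#include <stdio.h>

struct node {
	const char *name;
	struct node *parent;
	atomic_int online_cnt;		/* self + online children */
};

static void node_online(struct node *n, struct node *parent)
{
	n->parent = parent;
	atomic_fetch_add(&n->online_cnt, 1);		/* count self */
	if (parent)
		atomic_fetch_add(&parent->online_cnt, 1);	/* child pins parent */
}

/* Mirrors the css_killed_work_fn() loop in the patch: offline @n, then
 * ascend, offlining each ancestor whose count we just drained to zero.
 * fetch_sub() returning 1 here == atomic_dec_and_test() hitting zero. */
static void offline_walk(struct node *n)
{
	do {
		printf("offline %s\n", n->name);
		n = n->parent;
	} while (n && atomic_fetch_sub(&n->online_cnt, 1) == 1);
}

/* Mirrors css_killed_ref_fn(): only the drop that drains the count
 * starts the offline walk, so a parent always waits for its children. */
static void node_kill(struct node *n)
{
	if (atomic_fetch_sub(&n->online_cnt, 1) == 1)
		offline_walk(n);
}

int main(void)
{
	struct node parent = { .name = "parent" };
	struct node child = { .name = "child" };

	node_online(&parent, NULL);
	node_online(&child, &parent);

	node_kill(&parent);	/* count 2 -> 1: parent stays online */
	node_kill(&child);	/* prints "offline child", then "offline parent" */
	return 0;
}

Killing the parent first merely drops its count from 2 to 1; nothing is offlined until the child's kill drains both counts in order, so "offline child" always prints before "offline parent".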
-rw-r--r--  Documentation/cgroup-v2.txt   2
-rw-r--r--  include/linux/cgroup-defs.h   6
-rw-r--r--  include/linux/cpuset.h        6
-rw-r--r--  kernel/cgroup.c              31
-rw-r--r--  kernel/cpuset.c              71
5 files changed, 85 insertions(+), 31 deletions(-)
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt
index e8d25e784214..ff49cf901148 100644
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and
 conventions of cgroup v2. It describes all userland-visible aspects
 of cgroup including core and specific controller behaviors. All
 future changes must be reflected in this document. Documentation for
-v1 is available under Documentation/cgroup-legacy/.
+v1 is available under Documentation/cgroup-v1/.
 
 CONTENTS
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 7f540f7f588d..789471dba6fb 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -127,6 +127,12 @@ struct cgroup_subsys_state {
 	 */
 	u64 serial_nr;
 
+	/*
+	 * Incremented by online self and children.  Used to guarantee that
+	 * parents are not offlined before their children.
+	 */
+	atomic_t online_cnt;
+
 	/* percpu_ref killing and RCU release */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868ccb493..fea160ee5803 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 	task_unlock(current);
 }
 
+extern void cpuset_post_attach_flush(void);
+
 #else /* !CONFIG_CPUSETS */
 
 static inline bool cpusets_enabled(void) { return false; }
@@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq)
 	return false;
 }
 
+static inline void cpuset_post_attach_flush(void)
+{
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c03a640ef6da..d27904c193da 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -58,6 +58,7 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/cpuset.h>
 #include <net/sock.h>
 
 /*
@@ -2739,6 +2740,7 @@ out_unlock_rcu:
 out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
+	cpuset_post_attach_flush();
 	return ret ?: nbytes;
 }
 
@@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css free path */
+		struct cgroup_subsys_state *parent = css->parent;
 		int id = css->id;
 
-		if (css->parent)
-			css_put(css->parent);
-
 		ss->css_free(css);
 		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
+
+		if (parent)
+			css_put(parent);
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
@@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
 	css->serial_nr = css_serial_nr_next++;
+	atomic_set(&css->online_cnt, 0);
 
 	if (cgroup_parent(cgrp)) {
 		css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
 	if (!ret) {
 		css->flags |= CSS_ONLINE;
 		rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+		atomic_inc(&css->online_cnt);
+		if (css->parent)
+			atomic_inc(&css->parent->online_cnt);
 	}
 	return ret;
 }
@@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 
 	mutex_lock(&cgroup_mutex);
-	offline_css(css);
-	mutex_unlock(&cgroup_mutex);
 
-	css_put(css);
+	do {
+		offline_css(css);
+		css_put(css);
+		/* @css can't go away while we're holding cgroup_mutex */
+		css = css->parent;
+	} while (css && atomic_dec_and_test(&css->online_cnt));
+
+	mutex_unlock(&cgroup_mutex);
 }
 
 /* css kill confirmation processing requires process context, bounce */
@@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
 	struct cgroup_subsys_state *css =
 		container_of(ref, struct cgroup_subsys_state, refcnt);
 
-	INIT_WORK(&css->destroy_work, css_killed_work_fn);
-	queue_work(cgroup_destroy_wq, &css->destroy_work);
+	if (atomic_dec_and_test(&css->online_cnt)) {
+		INIT_WORK(&css->destroy_work, css_killed_work_fn);
+		queue_work(cgroup_destroy_wq, &css->destroy_work);
+	}
 }
 
 /**
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e945fcd8179..41989ab4db57 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_SPINLOCK(callback_lock);
 
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 }
 
 /*
- * cpuset_migrate_mm
- *
- *    Migrate memory region from one set of nodes to another.
- *
- *    Temporarilly set tasks mems_allowed to target nodes of migration,
- *    so that the migration code can allocate pages on these nodes.
- *
- *    While the mm_struct we are migrating is typically from some
- *    other task, the task_struct mems_allowed that we are hacking
- *    is for our current task, which must allocate new pages for that
- *    migrating memory region.
+ * Migrate memory region from one set of nodes to another.  This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management.  All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
  */
 
+struct cpuset_migrate_mm_work {
+	struct work_struct	work;
+	struct mm_struct	*mm;
+	nodemask_t		from;
+	nodemask_t		to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+	struct cpuset_migrate_mm_work *mwork =
+		container_of(work, struct cpuset_migrate_mm_work, work);
+
+	/* on a wq worker, no need to worry about %current's mems_allowed */
+	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+	mmput(mwork->mm);
+	kfree(mwork);
+}
+
 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 							const nodemask_t *to)
 {
-	struct task_struct *tsk = current;
-
-	tsk->mems_allowed = *to;
+	struct cpuset_migrate_mm_work *mwork;
 
-	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+	if (mwork) {
+		mwork->mm = mm;
+		mwork->from = *from;
+		mwork->to = *to;
+		INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+		queue_work(cpuset_migrate_mm_wq, &mwork->work);
+	} else {
+		mmput(mm);
+	}
+}
 
-	rcu_read_lock();
-	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-	rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+	flush_workqueue(cpuset_migrate_mm_wq);
 }
 
 /*
@@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
 			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-		mmput(mm);
+		else
+			mmput(mm);
 	}
 	css_task_iter_end(&it);
 
@@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 			 * @old_mems_allowed is the right nodesets that we
 			 * migrate mm from.
 			 */
-			if (is_memory_migrate(cs)) {
+			if (is_memory_migrate(cs))
 				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
 						  &cpuset_attach_nodemask_to);
-			}
-			mmput(mm);
+			else
+				mmput(mm);
 		}
 	}
 
@@ -1714,6 +1737,7 @@ out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	kernfs_unbreak_active_protection(of->kn);
 	css_put(&cs->css);
+	flush_workqueue(cpuset_migrate_mm_wq);
 	return retval ?: nbytes;
 }
 
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
 	top_cpuset.effective_mems = node_states[N_MEMORY];
 
 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+	BUG_ON(!cpuset_migrate_mm_wq);
 }
 
 /**
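The cpuset change above is an instance of a queue-then-flush pattern: each migration request snapshots its arguments into a heap-allocated work item, an ordered workqueue drains the items in FIFO order, and cpuset_post_attach_flush() waits for everything queued so far by flushing the workqueue. A rough userspace analogue follows, a sketch that substitutes pthreads for the kernel workqueue API; all names (migrate_async, migrate_flush) are hypothetical.

/* Userspace sketch of queue_work() + flush_workqueue(); hypothetical names. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct job {
	struct job *next;
	int from, to;			/* stands in for the nodemasks */
};

static struct job *head, **tail = &head;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t more = PTHREAD_COND_INITIALIZER;
static pthread_cond_t drained = PTHREAD_COND_INITIALIZER;
static int busy;			/* jobs queued or running */

static void *worker(void *arg)		/* single thread => FIFO, "ordered" */
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!head)
			pthread_cond_wait(&more, &lock);
		struct job *j = head;
		head = j->next;
		if (!head)
			tail = &head;
		pthread_mutex_unlock(&lock);

		printf("migrating %d -> %d\n", j->from, j->to);	/* the slow work */
		free(j);

		pthread_mutex_lock(&lock);
		if (--busy == 0)
			pthread_cond_broadcast(&drained);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

static void migrate_async(int from, int to)
{
	struct job *j = malloc(sizeof(*j));
	if (!j)
		return;			/* best effort, like the patch */
	j->from = from;
	j->to = to;
	j->next = NULL;
	pthread_mutex_lock(&lock);
	*tail = j;			/* append: queued order preserved */
	tail = &j->next;
	busy++;
	pthread_cond_signal(&more);
	pthread_mutex_unlock(&lock);
}

static void migrate_flush(void)		/* like cpuset_post_attach_flush() */
{
	pthread_mutex_lock(&lock);
	while (busy)
		pthread_cond_wait(&drained, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, worker, NULL);
	migrate_async(0, 1);
	migrate_async(1, 2);
	migrate_flush();		/* returns only after both jobs ran */
	return 0;
}

The payoff of the design is that the slow do_migrate_pages() call runs with no cgroup locks held; callers that need completion semantics opt in by flushing after they have dropped their locks, as the write paths in the diff above do.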