diff options
| -rw-r--r-- | Documentation/cgroup-v2.txt | 2 | ||||
| -rw-r--r-- | include/linux/cgroup-defs.h | 6 | ||||
| -rw-r--r-- | include/linux/cpuset.h | 6 | ||||
| -rw-r--r-- | kernel/cgroup.c | 31 | ||||
| -rw-r--r-- | kernel/cpuset.c | 71 |
5 files changed, 85 insertions, 31 deletions
diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index e8d25e784214..ff49cf901148 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt | |||
| @@ -7,7 +7,7 @@ This is the authoritative documentation on the design, interface and | |||
| 7 | conventions of cgroup v2. It describes all userland-visible aspects | 7 | conventions of cgroup v2. It describes all userland-visible aspects |
| 8 | of cgroup including core and specific controller behaviors. All | 8 | of cgroup including core and specific controller behaviors. All |
| 9 | future changes must be reflected in this document. Documentation for | 9 | future changes must be reflected in this document. Documentation for |
| 10 | v1 is available under Documentation/cgroup-legacy/. | 10 | v1 is available under Documentation/cgroup-v1/. |
| 11 | 11 | ||
| 12 | CONTENTS | 12 | CONTENTS |
| 13 | 13 | ||
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 7f540f7f588d..789471dba6fb 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
| @@ -127,6 +127,12 @@ struct cgroup_subsys_state { | |||
| 127 | */ | 127 | */ |
| 128 | u64 serial_nr; | 128 | u64 serial_nr; |
| 129 | 129 | ||
| 130 | /* | ||
| 131 | * Incremented by online self and children. Used to guarantee that | ||
| 132 | * parents are not offlined before their children. | ||
| 133 | */ | ||
| 134 | atomic_t online_cnt; | ||
| 135 | |||
| 130 | /* percpu_ref killing and RCU release */ | 136 | /* percpu_ref killing and RCU release */ |
| 131 | struct rcu_head rcu_head; | 137 | struct rcu_head rcu_head; |
| 132 | struct work_struct destroy_work; | 138 | struct work_struct destroy_work; |
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..fea160ee5803 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
| @@ -137,6 +137,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) | |||
| 137 | task_unlock(current); | 137 | task_unlock(current); |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | extern void cpuset_post_attach_flush(void); | ||
| 141 | |||
| 140 | #else /* !CONFIG_CPUSETS */ | 142 | #else /* !CONFIG_CPUSETS */ |
| 141 | 143 | ||
| 142 | static inline bool cpusets_enabled(void) { return false; } | 144 | static inline bool cpusets_enabled(void) { return false; } |
| @@ -243,6 +245,10 @@ static inline bool read_mems_allowed_retry(unsigned int seq) | |||
| 243 | return false; | 245 | return false; |
| 244 | } | 246 | } |
| 245 | 247 | ||
| 248 | static inline void cpuset_post_attach_flush(void) | ||
| 249 | { | ||
| 250 | } | ||
| 251 | |||
| 246 | #endif /* !CONFIG_CPUSETS */ | 252 | #endif /* !CONFIG_CPUSETS */ |
| 247 | 253 | ||
| 248 | #endif /* _LINUX_CPUSET_H */ | 254 | #endif /* _LINUX_CPUSET_H */ |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c03a640ef6da..d27904c193da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -58,6 +58,7 @@ | |||
| 58 | #include <linux/kthread.h> | 58 | #include <linux/kthread.h> |
| 59 | #include <linux/delay.h> | 59 | #include <linux/delay.h> |
| 60 | #include <linux/atomic.h> | 60 | #include <linux/atomic.h> |
| 61 | #include <linux/cpuset.h> | ||
| 61 | #include <net/sock.h> | 62 | #include <net/sock.h> |
| 62 | 63 | ||
| 63 | /* | 64 | /* |
| @@ -2739,6 +2740,7 @@ out_unlock_rcu: | |||
| 2739 | out_unlock_threadgroup: | 2740 | out_unlock_threadgroup: |
| 2740 | percpu_up_write(&cgroup_threadgroup_rwsem); | 2741 | percpu_up_write(&cgroup_threadgroup_rwsem); |
| 2741 | cgroup_kn_unlock(of->kn); | 2742 | cgroup_kn_unlock(of->kn); |
| 2743 | cpuset_post_attach_flush(); | ||
| 2742 | return ret ?: nbytes; | 2744 | return ret ?: nbytes; |
| 2743 | } | 2745 | } |
| 2744 | 2746 | ||
| @@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) | |||
| 4655 | 4657 | ||
| 4656 | if (ss) { | 4658 | if (ss) { |
| 4657 | /* css free path */ | 4659 | /* css free path */ |
| 4660 | struct cgroup_subsys_state *parent = css->parent; | ||
| 4658 | int id = css->id; | 4661 | int id = css->id; |
| 4659 | 4662 | ||
| 4660 | if (css->parent) | ||
| 4661 | css_put(css->parent); | ||
| 4662 | |||
| 4663 | ss->css_free(css); | 4663 | ss->css_free(css); |
| 4664 | cgroup_idr_remove(&ss->css_idr, id); | 4664 | cgroup_idr_remove(&ss->css_idr, id); |
| 4665 | cgroup_put(cgrp); | 4665 | cgroup_put(cgrp); |
| 4666 | |||
| 4667 | if (parent) | ||
| 4668 | css_put(parent); | ||
| 4666 | } else { | 4669 | } else { |
| 4667 | /* cgroup free path */ | 4670 | /* cgroup free path */ |
| 4668 | atomic_dec(&cgrp->root->nr_cgrps); | 4671 | atomic_dec(&cgrp->root->nr_cgrps); |
| @@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, | |||
| 4758 | INIT_LIST_HEAD(&css->sibling); | 4761 | INIT_LIST_HEAD(&css->sibling); |
| 4759 | INIT_LIST_HEAD(&css->children); | 4762 | INIT_LIST_HEAD(&css->children); |
| 4760 | css->serial_nr = css_serial_nr_next++; | 4763 | css->serial_nr = css_serial_nr_next++; |
| 4764 | atomic_set(&css->online_cnt, 0); | ||
| 4761 | 4765 | ||
| 4762 | if (cgroup_parent(cgrp)) { | 4766 | if (cgroup_parent(cgrp)) { |
| 4763 | css->parent = cgroup_css(cgroup_parent(cgrp), ss); | 4767 | css->parent = cgroup_css(cgroup_parent(cgrp), ss); |
| @@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css) | |||
| 4780 | if (!ret) { | 4784 | if (!ret) { |
| 4781 | css->flags |= CSS_ONLINE; | 4785 | css->flags |= CSS_ONLINE; |
| 4782 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); | 4786 | rcu_assign_pointer(css->cgroup->subsys[ss->id], css); |
| 4787 | |||
| 4788 | atomic_inc(&css->online_cnt); | ||
| 4789 | if (css->parent) | ||
| 4790 | atomic_inc(&css->parent->online_cnt); | ||
| 4783 | } | 4791 | } |
| 4784 | return ret; | 4792 | return ret; |
| 4785 | } | 4793 | } |
| @@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work) | |||
| 5017 | container_of(work, struct cgroup_subsys_state, destroy_work); | 5025 | container_of(work, struct cgroup_subsys_state, destroy_work); |
| 5018 | 5026 | ||
| 5019 | mutex_lock(&cgroup_mutex); | 5027 | mutex_lock(&cgroup_mutex); |
| 5020 | offline_css(css); | ||
| 5021 | mutex_unlock(&cgroup_mutex); | ||
| 5022 | 5028 | ||
| 5023 | css_put(css); | 5029 | do { |
| 5030 | offline_css(css); | ||
| 5031 | css_put(css); | ||
| 5032 | /* @css can't go away while we're holding cgroup_mutex */ | ||
| 5033 | css = css->parent; | ||
| 5034 | } while (css && atomic_dec_and_test(&css->online_cnt)); | ||
| 5035 | |||
| 5036 | mutex_unlock(&cgroup_mutex); | ||
| 5024 | } | 5037 | } |
| 5025 | 5038 | ||
| 5026 | /* css kill confirmation processing requires process context, bounce */ | 5039 | /* css kill confirmation processing requires process context, bounce */ |
| @@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) | |||
| 5029 | struct cgroup_subsys_state *css = | 5042 | struct cgroup_subsys_state *css = |
| 5030 | container_of(ref, struct cgroup_subsys_state, refcnt); | 5043 | container_of(ref, struct cgroup_subsys_state, refcnt); |
| 5031 | 5044 | ||
| 5032 | INIT_WORK(&css->destroy_work, css_killed_work_fn); | 5045 | if (atomic_dec_and_test(&css->online_cnt)) { |
| 5033 | queue_work(cgroup_destroy_wq, &css->destroy_work); | 5046 | INIT_WORK(&css->destroy_work, css_killed_work_fn); |
| 5047 | queue_work(cgroup_destroy_wq, &css->destroy_work); | ||
| 5048 | } | ||
| 5034 | } | 5049 | } |
| 5035 | 5050 | ||
| 5036 | /** | 5051 | /** |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e945fcd8179..41989ab4db57 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
| @@ -287,6 +287,8 @@ static struct cpuset top_cpuset = { | |||
| 287 | static DEFINE_MUTEX(cpuset_mutex); | 287 | static DEFINE_MUTEX(cpuset_mutex); |
| 288 | static DEFINE_SPINLOCK(callback_lock); | 288 | static DEFINE_SPINLOCK(callback_lock); |
| 289 | 289 | ||
| 290 | static struct workqueue_struct *cpuset_migrate_mm_wq; | ||
| 291 | |||
| 290 | /* | 292 | /* |
| 291 | * CPU / memory hotplug is handled asynchronously. | 293 | * CPU / memory hotplug is handled asynchronously. |
| 292 | */ | 294 | */ |
| @@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
| 972 | } | 974 | } |
| 973 | 975 | ||
| 974 | /* | 976 | /* |
| 975 | * cpuset_migrate_mm | 977 | * Migrate memory region from one set of nodes to another. This is |
| 976 | * | 978 | * performed asynchronously as it can be called from process migration path |
| 977 | * Migrate memory region from one set of nodes to another. | 979 | * holding locks involved in process management. All mm migrations are |
| 978 | * | 980 | * performed in the queued order and can be waited for by flushing |
| 979 | * Temporarilly set tasks mems_allowed to target nodes of migration, | 981 | * cpuset_migrate_mm_wq. |
| 980 | * so that the migration code can allocate pages on these nodes. | ||
| 981 | * | ||
| 982 | * While the mm_struct we are migrating is typically from some | ||
| 983 | * other task, the task_struct mems_allowed that we are hacking | ||
| 984 | * is for our current task, which must allocate new pages for that | ||
| 985 | * migrating memory region. | ||
| 986 | */ | 982 | */ |
| 987 | 983 | ||
| 984 | struct cpuset_migrate_mm_work { | ||
| 985 | struct work_struct work; | ||
| 986 | struct mm_struct *mm; | ||
| 987 | nodemask_t from; | ||
| 988 | nodemask_t to; | ||
| 989 | }; | ||
| 990 | |||
| 991 | static void cpuset_migrate_mm_workfn(struct work_struct *work) | ||
| 992 | { | ||
| 993 | struct cpuset_migrate_mm_work *mwork = | ||
| 994 | container_of(work, struct cpuset_migrate_mm_work, work); | ||
| 995 | |||
| 996 | /* on a wq worker, no need to worry about %current's mems_allowed */ | ||
| 997 | do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); | ||
| 998 | mmput(mwork->mm); | ||
| 999 | kfree(mwork); | ||
| 1000 | } | ||
| 1001 | |||
| 988 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | 1002 | static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, |
| 989 | const nodemask_t *to) | 1003 | const nodemask_t *to) |
| 990 | { | 1004 | { |
| 991 | struct task_struct *tsk = current; | 1005 | struct cpuset_migrate_mm_work *mwork; |
| 992 | |||
| 993 | tsk->mems_allowed = *to; | ||
| 994 | 1006 | ||
| 995 | do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); | 1007 | mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); |
| 1008 | if (mwork) { | ||
| 1009 | mwork->mm = mm; | ||
| 1010 | mwork->from = *from; | ||
| 1011 | mwork->to = *to; | ||
| 1012 | INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); | ||
| 1013 | queue_work(cpuset_migrate_mm_wq, &mwork->work); | ||
| 1014 | } else { | ||
| 1015 | mmput(mm); | ||
| 1016 | } | ||
| 1017 | } | ||
| 996 | 1018 | ||
| 997 | rcu_read_lock(); | 1019 | void cpuset_post_attach_flush(void) |
| 998 | guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); | 1020 | { |
| 999 | rcu_read_unlock(); | 1021 | flush_workqueue(cpuset_migrate_mm_wq); |
| 1000 | } | 1022 | } |
| 1001 | 1023 | ||
| 1002 | /* | 1024 | /* |
| @@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) | |||
| 1097 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1119 | mpol_rebind_mm(mm, &cs->mems_allowed); |
| 1098 | if (migrate) | 1120 | if (migrate) |
| 1099 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); | 1121 | cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); |
| 1100 | mmput(mm); | 1122 | else |
| 1123 | mmput(mm); | ||
| 1101 | } | 1124 | } |
| 1102 | css_task_iter_end(&it); | 1125 | css_task_iter_end(&it); |
| 1103 | 1126 | ||
| @@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) | |||
| 1545 | * @old_mems_allowed is the right nodesets that we | 1568 | * @old_mems_allowed is the right nodesets that we |
| 1546 | * migrate mm from. | 1569 | * migrate mm from. |
| 1547 | */ | 1570 | */ |
| 1548 | if (is_memory_migrate(cs)) { | 1571 | if (is_memory_migrate(cs)) |
| 1549 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, | 1572 | cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, |
| 1550 | &cpuset_attach_nodemask_to); | 1573 | &cpuset_attach_nodemask_to); |
| 1551 | } | 1574 | else |
| 1552 | mmput(mm); | 1575 | mmput(mm); |
| 1553 | } | 1576 | } |
| 1554 | } | 1577 | } |
| 1555 | 1578 | ||
| @@ -1714,6 +1737,7 @@ out_unlock: | |||
| 1714 | mutex_unlock(&cpuset_mutex); | 1737 | mutex_unlock(&cpuset_mutex); |
| 1715 | kernfs_unbreak_active_protection(of->kn); | 1738 | kernfs_unbreak_active_protection(of->kn); |
| 1716 | css_put(&cs->css); | 1739 | css_put(&cs->css); |
| 1740 | flush_workqueue(cpuset_migrate_mm_wq); | ||
| 1717 | return retval ?: nbytes; | 1741 | return retval ?: nbytes; |
| 1718 | } | 1742 | } |
| 1719 | 1743 | ||
| @@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void) | |||
| 2359 | top_cpuset.effective_mems = node_states[N_MEMORY]; | 2383 | top_cpuset.effective_mems = node_states[N_MEMORY]; |
| 2360 | 2384 | ||
| 2361 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); | 2385 | register_hotmemory_notifier(&cpuset_track_online_nodes_nb); |
| 2386 | |||
| 2387 | cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); | ||
| 2388 | BUG_ON(!cpuset_migrate_mm_wq); | ||
| 2362 | } | 2389 | } |
| 2363 | 2390 | ||
| 2364 | /** | 2391 | /** |
