aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2013-07-02 23:04:25 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2013-07-02 23:04:25 -0400
commit 0b0585c3e192967cb2ef0ac0816eb8a8c8d99840 (patch)
tree 9f655158a396623736b8cc94c2a27a2fce4ab644
parent b028161fbba178ccd35aa69051c04d7673fe9d80 (diff)
parent c9e5fe66f5947c9e56dfc7655e5b4b127ca2120f (diff)
Merge branch 'for-3.11-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cpuset changes from Tejun Heo:
"cpuset has always been rather odd about its configurations - a cgroup right after creation didn't allow any task executions before configuration, changing configuration in the parent modifies the descendants irreversibly and so on. These behaviors are inherently nasty and almost hostile against sharing the hierarchy with other controllers making it very difficult to use in unified hierarchy.
Li is currently in the process of updating the behaviors for __DEVEL__sane_behavior which is the bulk of changes in this pull request. It isn't complete yet and the behaviors will change further but all changes are gated behind sane_behavior. In the process, the rather hairy work-item punting which was used to work around the limitations of cgroup descendant iterator was simplified."
* 'for-3.11-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: rename @cont to @cgrp
  cpuset: fix to migrate mm correctly in a corner case
  cpuset: allow to move tasks to empty cpusets
  cpuset: allow to keep tasks in empty cpusets
  cpuset: introduce effective_{cpumask|nodemask}_cpuset()
  cpuset: record old_mems_allowed in struct cpuset
  cpuset: remove async hotplug propagation work
  cpuset: let hotplug propagation work wait for task attaching
  cpuset: re-structure update_cpumask() a bit
  cpuset: remove cpuset_test_cpumask()
  cpuset: remove unnecessary variable in cpuset_attach()
  cpuset: cleanup guarantee_online_{cpus|mems}()
  cpuset: remove redundant check in cpuset_cpus_allowed_fallback()
-rw-r--r-- include/linux/cgroup.h 9
-rw-r--r-- kernel/cpuset.c 478
2 files changed, 295 insertions, 192 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8db53974f7b5..fd097ecfcd97 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -261,13 +261,20 @@ enum {
261 * 261 *
262 * - Remount is disallowed. 262 * - Remount is disallowed.
263 * 263 *
264 * - rename(2) is disallowed.
265 *
264 * - "tasks" is removed. Everything should be at process 266 * - "tasks" is removed. Everything should be at process
265 * granularity. Use "cgroup.procs" instead. 267 * granularity. Use "cgroup.procs" instead.
266 * 268 *
267 * - "release_agent" and "notify_on_release" are removed. 269 * - "release_agent" and "notify_on_release" are removed.
268 * Replacement notification mechanism will be implemented. 270 * Replacement notification mechanism will be implemented.
269 * 271 *
270 * - rename(2) is disallowed. 272 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
273 * and take masks of ancestors with non-empty cpus/mems, instead of
274 * being moved to an ancestor.
275 *
276 * - cpuset: a task can be moved into an empty cpuset, and again it
277 * takes masks of ancestors.
271 * 278 *
272 * - memcg: use_hierarchy is on by default and the cgroup file for 279 * - memcg: use_hierarchy is on by default and the cgroup file for
273 * the flag is not created. 280 * the flag is not created.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 902d13fc2b13..e5657788fedd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,6 +59,7 @@
59#include <linux/mutex.h> 59#include <linux/mutex.h>
60#include <linux/workqueue.h> 60#include <linux/workqueue.h>
61#include <linux/cgroup.h> 61#include <linux/cgroup.h>
62#include <linux/wait.h>
62 63
63/* 64/*
64 * Tracks how many cpusets are currently defined in system. 65 * Tracks how many cpusets are currently defined in system.
@@ -87,6 +88,18 @@ struct cpuset {
87 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 88 cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
88 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 89 nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
89 90
91 /*
92 * This is old Memory Nodes tasks took on.
93 *
94 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
95 * - A new cpuset's old_mems_allowed is initialized when some
96 * task is moved into it.
97 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
98 * cpuset.mems_allowed and have tasks' nodemask updated, and
99 * then old_mems_allowed is updated to mems_allowed.
100 */
101 nodemask_t old_mems_allowed;
102
90 struct fmeter fmeter; /* memory_pressure filter */ 103 struct fmeter fmeter; /* memory_pressure filter */
91 104
92 /* 105 /*
@@ -100,14 +113,12 @@ struct cpuset {
100 113
101 /* for custom sched domain */ 114 /* for custom sched domain */
102 int relax_domain_level; 115 int relax_domain_level;
103
104 struct work_struct hotplug_work;
105}; 116};
106 117
107/* Retrieve the cpuset for a cgroup */ 118/* Retrieve the cpuset for a cgroup */
108static inline struct cpuset *cgroup_cs(struct cgroup *cont) 119static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
109{ 120{
110 return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), 121 return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),
111 struct cpuset, css); 122 struct cpuset, css);
112} 123}
113 124
@@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex);
267/* 278/*
268 * CPU / memory hotplug is handled asynchronously. 279 * CPU / memory hotplug is handled asynchronously.
269 */ 280 */
270static struct workqueue_struct *cpuset_propagate_hotplug_wq;
271
272static void cpuset_hotplug_workfn(struct work_struct *work); 281static void cpuset_hotplug_workfn(struct work_struct *work);
273static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
274static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
275
276static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); 282static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
277 283
284static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
285
278/* 286/*
279 * This is ugly, but preserves the userspace API for existing cpuset 287 * This is ugly, but preserves the userspace API for existing cpuset
280 * users. If someone tries to mount the "cpuset" filesystem, we 288 * users. If someone tries to mount the "cpuset" filesystem, we
@@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = {
304/* 312/*
305 * Return in pmask the portion of a cpusets's cpus_allowed that 313 * Return in pmask the portion of a cpusets's cpus_allowed that
306 * are online. If none are online, walk up the cpuset hierarchy 314 * are online. If none are online, walk up the cpuset hierarchy
307 * until we find one that does have some online cpus. If we get 315 * until we find one that does have some online cpus. The top
308 * all the way to the top and still haven't found any online cpus, 316 * cpuset always has some cpus online.
309 * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
310 * task, return cpu_online_mask.
311 * 317 *
312 * One way or another, we guarantee to return some non-empty subset 318 * One way or another, we guarantee to return some non-empty subset
313 * of cpu_online_mask. 319 * of cpu_online_mask.
314 * 320 *
315 * Call with callback_mutex held. 321 * Call with callback_mutex held.
316 */ 322 */
317
318static void guarantee_online_cpus(const struct cpuset *cs, 323static void guarantee_online_cpus(const struct cpuset *cs,
319 struct cpumask *pmask) 324 struct cpumask *pmask)
320{ 325{
321 while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 326 while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
322 cs = parent_cs(cs); 327 cs = parent_cs(cs);
323 if (cs) 328 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
324 cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
325 else
326 cpumask_copy(pmask, cpu_online_mask);
327 BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
328} 329}
329 330
330/* 331/*
331 * Return in *pmask the portion of a cpusets's mems_allowed that 332 * Return in *pmask the portion of a cpusets's mems_allowed that
332 * are online, with memory. If none are online with memory, walk 333 * are online, with memory. If none are online with memory, walk
333 * up the cpuset hierarchy until we find one that does have some 334 * up the cpuset hierarchy until we find one that does have some
334 * online mems. If we get all the way to the top and still haven't 335 * online mems. The top cpuset always has some mems online.
335 * found any online mems, return node_states[N_MEMORY].
336 * 336 *
337 * One way or another, we guarantee to return some non-empty subset 337 * One way or another, we guarantee to return some non-empty subset
338 * of node_states[N_MEMORY]. 338 * of node_states[N_MEMORY].
339 * 339 *
340 * Call with callback_mutex held. 340 * Call with callback_mutex held.
341 */ 341 */
342
343static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) 342static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
344{ 343{
345 while (cs && !nodes_intersects(cs->mems_allowed, 344 while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
346 node_states[N_MEMORY]))
347 cs = parent_cs(cs); 345 cs = parent_cs(cs);
348 if (cs) 346 nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
349 nodes_and(*pmask, cs->mems_allowed,
350 node_states[N_MEMORY]);
351 else
352 *pmask = node_states[N_MEMORY];
353 BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
354} 347}
355 348
356/* 349/*
@@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial)
440 433
441static int validate_change(const struct cpuset *cur, const struct cpuset *trial) 434static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
442{ 435{
443 struct cgroup *cont; 436 struct cgroup *cgrp;
444 struct cpuset *c, *par; 437 struct cpuset *c, *par;
445 int ret; 438 int ret;
446 439
@@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
448 441
449 /* Each of our child cpusets must be a subset of us */ 442 /* Each of our child cpusets must be a subset of us */
450 ret = -EBUSY; 443 ret = -EBUSY;
451 cpuset_for_each_child(c, cont, cur) 444 cpuset_for_each_child(c, cgrp, cur)
452 if (!is_cpuset_subset(c, trial)) 445 if (!is_cpuset_subset(c, trial))
453 goto out; 446 goto out;
454 447
@@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
469 * overlap 462 * overlap
470 */ 463 */
471 ret = -EINVAL; 464 ret = -EINVAL;
472 cpuset_for_each_child(c, cont, par) { 465 cpuset_for_each_child(c, cgrp, par) {
473 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 466 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
474 c != cur && 467 c != cur &&
475 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) 468 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -486,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
486 */ 479 */
487 ret = -ENOSPC; 480 ret = -ENOSPC;
488 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && 481 if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
489 (cpumask_empty(trial->cpus_allowed) || 482 (cpumask_empty(trial->cpus_allowed) &&
490 nodes_empty(trial->mems_allowed))) 483 nodes_empty(trial->mems_allowed)))
491 goto out; 484 goto out;
492 485
@@ -798,21 +791,43 @@ void rebuild_sched_domains(void)
798 mutex_unlock(&cpuset_mutex); 791 mutex_unlock(&cpuset_mutex);
799} 792}
800 793
801/** 794/*
802 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's 795 * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
803 * @tsk: task to test 796 * @cs: the cpuset in interest
804 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
805 * 797 *
806 * Call with cpuset_mutex held. May take callback_mutex during call. 798 * A cpuset's effective cpumask is the cpumask of the nearest ancestor
807 * Called for each task in a cgroup by cgroup_scan_tasks(). 799 * with non-empty cpus. We use effective cpumask whenever:
808 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 800 * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
809 * words, if its mask is not equal to its cpuset's mask). 801 * if the cpuset they reside in has no cpus)
802 * - we want to retrieve task_cs(tsk)'s cpus_allowed.
803 *
804 * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
805 * exception. See comments there.
810 */ 806 */
811static int cpuset_test_cpumask(struct task_struct *tsk, 807static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
812 struct cgroup_scanner *scan)
813{ 808{
814 return !cpumask_equal(&tsk->cpus_allowed, 809 while (cpumask_empty(cs->cpus_allowed))
815 (cgroup_cs(scan->cg))->cpus_allowed); 810 cs = parent_cs(cs);
811 return cs;
812}
813
814/*
815 * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
816 * @cs: the cpuset in interest
817 *
818 * A cpuset's effective nodemask is the nodemask of the nearest ancestor
819 * with non-empty mems. We use effective nodemask whenever:
820 * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
821 * if the cpuset they reside in has no mems)
822 * - we want to retrieve task_cs(tsk)'s mems_allowed.
823 *
824 * Called with cpuset_mutex held.
825 */
826static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
827{
828 while (nodes_empty(cs->mems_allowed))
829 cs = parent_cs(cs);
830 return cs;
816} 831}
817 832
818/** 833/**
@@ -829,7 +844,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
829static void cpuset_change_cpumask(struct task_struct *tsk, 844static void cpuset_change_cpumask(struct task_struct *tsk,
830 struct cgroup_scanner *scan) 845 struct cgroup_scanner *scan)
831{ 846{
832 set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); 847 struct cpuset *cpus_cs;
848
849 cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
850 set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
833} 851}
834 852
835/** 853/**
@@ -850,12 +868,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
850 struct cgroup_scanner scan; 868 struct cgroup_scanner scan;
851 869
852 scan.cg = cs->css.cgroup; 870 scan.cg = cs->css.cgroup;
853 scan.test_task = cpuset_test_cpumask; 871 scan.test_task = NULL;
854 scan.process_task = cpuset_change_cpumask; 872 scan.process_task = cpuset_change_cpumask;
855 scan.heap = heap; 873 scan.heap = heap;
856 cgroup_scan_tasks(&scan); 874 cgroup_scan_tasks(&scan);
857} 875}
858 876
877/*
878 * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
879 * @root_cs: the root cpuset of the hierarchy
880 * @update_root: update root cpuset or not?
881 * @heap: the heap used by cgroup_scan_tasks()
882 *
883 * This will update cpumasks of tasks in @root_cs and all other empty cpusets
884 * which take on cpumask of @root_cs.
885 *
886 * Called with cpuset_mutex held
887 */
888static void update_tasks_cpumask_hier(struct cpuset *root_cs,
889 bool update_root, struct ptr_heap *heap)
890{
891 struct cpuset *cp;
892 struct cgroup *pos_cgrp;
893
894 if (update_root)
895 update_tasks_cpumask(root_cs, heap);
896
897 rcu_read_lock();
898 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
899 /* skip the whole subtree if @cp has some CPUs */
900 if (!cpumask_empty(cp->cpus_allowed)) {
901 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
902 continue;
903 }
904 if (!css_tryget(&cp->css))
905 continue;
906 rcu_read_unlock();
907
908 update_tasks_cpumask(cp, heap);
909
910 rcu_read_lock();
911 css_put(&cp->css);
912 }
913 rcu_read_unlock();
914}
915
859/** 916/**
860 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 917 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
861 * @cs: the cpuset to consider 918 * @cs: the cpuset to consider
@@ -888,14 +945,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
888 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 945 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
889 return -EINVAL; 946 return -EINVAL;
890 } 947 }
891 retval = validate_change(cs, trialcs);
892 if (retval < 0)
893 return retval;
894 948
895 /* Nothing to do if the cpus didn't change */ 949 /* Nothing to do if the cpus didn't change */
896 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 950 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
897 return 0; 951 return 0;
898 952
953 retval = validate_change(cs, trialcs);
954 if (retval < 0)
955 return retval;
956
899 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); 957 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
900 if (retval) 958 if (retval)
901 return retval; 959 return retval;
@@ -906,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
906 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 964 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
907 mutex_unlock(&callback_mutex); 965 mutex_unlock(&callback_mutex);
908 966
909 /* 967 update_tasks_cpumask_hier(cs, true, &heap);
910 * Scan tasks in the cpuset, and update the cpumasks of any
911 * that need an update.
912 */
913 update_tasks_cpumask(cs, &heap);
914 968
915 heap_free(&heap); 969 heap_free(&heap);
916 970
@@ -943,12 +997,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
943 const nodemask_t *to) 997 const nodemask_t *to)
944{ 998{
945 struct task_struct *tsk = current; 999 struct task_struct *tsk = current;
1000 struct cpuset *mems_cs;
946 1001
947 tsk->mems_allowed = *to; 1002 tsk->mems_allowed = *to;
948 1003
949 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 1004 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
950 1005
951 guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); 1006 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
1007 guarantee_online_mems(mems_cs, &tsk->mems_allowed);
952} 1008}
953 1009
954/* 1010/*
@@ -1007,16 +1063,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1007static void cpuset_change_nodemask(struct task_struct *p, 1063static void cpuset_change_nodemask(struct task_struct *p,
1008 struct cgroup_scanner *scan) 1064 struct cgroup_scanner *scan)
1009{ 1065{
1066 struct cpuset *cs = cgroup_cs(scan->cg);
1010 struct mm_struct *mm; 1067 struct mm_struct *mm;
1011 struct cpuset *cs;
1012 int migrate; 1068 int migrate;
1013 const nodemask_t *oldmem = scan->data; 1069 nodemask_t *newmems = scan->data;
1014 static nodemask_t newmems; /* protected by cpuset_mutex */
1015
1016 cs = cgroup_cs(scan->cg);
1017 guarantee_online_mems(cs, &newmems);
1018 1070
1019 cpuset_change_task_nodemask(p, &newmems); 1071 cpuset_change_task_nodemask(p, newmems);
1020 1072
1021 mm = get_task_mm(p); 1073 mm = get_task_mm(p);
1022 if (!mm) 1074 if (!mm)
@@ -1026,7 +1078,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1026 1078
1027 mpol_rebind_mm(mm, &cs->mems_allowed); 1079 mpol_rebind_mm(mm, &cs->mems_allowed);
1028 if (migrate) 1080 if (migrate)
1029 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); 1081 cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
1030 mmput(mm); 1082 mmput(mm);
1031} 1083}
1032 1084
@@ -1035,25 +1087,27 @@ static void *cpuset_being_rebound;
1035/** 1087/**
1036 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. 1088 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1037 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1089 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1038 * @oldmem: old mems_allowed of cpuset cs
1039 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1090 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1040 * 1091 *
1041 * Called with cpuset_mutex held 1092 * Called with cpuset_mutex held
1042 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1093 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1043 * if @heap != NULL. 1094 * if @heap != NULL.
1044 */ 1095 */
1045static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, 1096static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
1046 struct ptr_heap *heap)
1047{ 1097{
1098 static nodemask_t newmems; /* protected by cpuset_mutex */
1048 struct cgroup_scanner scan; 1099 struct cgroup_scanner scan;
1100 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1049 1101
1050 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1102 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1051 1103
1104 guarantee_online_mems(mems_cs, &newmems);
1105
1052 scan.cg = cs->css.cgroup; 1106 scan.cg = cs->css.cgroup;
1053 scan.test_task = NULL; 1107 scan.test_task = NULL;
1054 scan.process_task = cpuset_change_nodemask; 1108 scan.process_task = cpuset_change_nodemask;
1055 scan.heap = heap; 1109 scan.heap = heap;
1056 scan.data = (nodemask_t *)oldmem; 1110 scan.data = &newmems;
1057 1111
1058 /* 1112 /*
1059 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't 1113 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1067,11 +1121,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1067 */ 1121 */
1068 cgroup_scan_tasks(&scan); 1122 cgroup_scan_tasks(&scan);
1069 1123
1124 /*
1125 * All the tasks' nodemasks have been updated, update
1126 * cs->old_mems_allowed.
1127 */
1128 cs->old_mems_allowed = newmems;
1129
1070 /* We're done rebinding vmas to this cpuset's new mems_allowed. */ 1130 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1071 cpuset_being_rebound = NULL; 1131 cpuset_being_rebound = NULL;
1072} 1132}
1073 1133
1074/* 1134/*
1135 * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
1136 * @root_cs: the root cpuset of the hierarchy
1137 * @update_root: update the root cpuset or not?
1138 * @heap: the heap used by cgroup_scan_tasks()
1139 *
1140 * This will update nodemasks of tasks in @root_cs and all other empty cpusets
1141 * which take on nodemask of @root_cs.
1142 *
1143 * Called with cpuset_mutex held
1144 */
1145static void update_tasks_nodemask_hier(struct cpuset *root_cs,
1146 bool update_root, struct ptr_heap *heap)
1147{
1148 struct cpuset *cp;
1149 struct cgroup *pos_cgrp;
1150
1151 if (update_root)
1152 update_tasks_nodemask(root_cs, heap);
1153
1154 rcu_read_lock();
1155 cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
1156 /* skip the whole subtree if @cp has some memory nodes */
1157 if (!nodes_empty(cp->mems_allowed)) {
1158 pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
1159 continue;
1160 }
1161 if (!css_tryget(&cp->css))
1162 continue;
1163 rcu_read_unlock();
1164
1165 update_tasks_nodemask(cp, heap);
1166
1167 rcu_read_lock();
1168 css_put(&cp->css);
1169 }
1170 rcu_read_unlock();
1171}
1172
1173/*
1075 * Handle user request to change the 'mems' memory placement 1174 * Handle user request to change the 'mems' memory placement
1076 * of a cpuset. Needs to validate the request, update the 1175 * of a cpuset. Needs to validate the request, update the
1077 * cpusets mems_allowed, and for each task in the cpuset, 1176 * cpusets mems_allowed, and for each task in the cpuset,
@@ -1087,13 +1186,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1087static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1186static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1088 const char *buf) 1187 const char *buf)
1089{ 1188{
1090 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1091 int retval; 1189 int retval;
1092 struct ptr_heap heap; 1190 struct ptr_heap heap;
1093 1191
1094 if (!oldmem)
1095 return -ENOMEM;
1096
1097 /* 1192 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 1193 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
1099 * it's read-only 1194 * it's read-only
@@ -1122,8 +1217,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1122 goto done; 1217 goto done;
1123 } 1218 }
1124 } 1219 }
1125 *oldmem = cs->mems_allowed; 1220
1126 if (nodes_equal(*oldmem, trialcs->mems_allowed)) { 1221 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1127 retval = 0; /* Too easy - nothing to do */ 1222 retval = 0; /* Too easy - nothing to do */
1128 goto done; 1223 goto done;
1129 } 1224 }
@@ -1139,11 +1234,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1139 cs->mems_allowed = trialcs->mems_allowed; 1234 cs->mems_allowed = trialcs->mems_allowed;
1140 mutex_unlock(&callback_mutex); 1235 mutex_unlock(&callback_mutex);
1141 1236
1142 update_tasks_nodemask(cs, oldmem, &heap); 1237 update_tasks_nodemask_hier(cs, true, &heap);
1143 1238
1144 heap_free(&heap); 1239 heap_free(&heap);
1145done: 1240done:
1146 NODEMASK_FREE(oldmem);
1147 return retval; 1241 return retval;
1148} 1242}
1149 1243
@@ -1372,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1372 1466
1373 mutex_lock(&cpuset_mutex); 1467 mutex_lock(&cpuset_mutex);
1374 1468
1469 /*
1470 * We allow to move tasks into an empty cpuset if sane_behavior
1471 * flag is set.
1472 */
1375 ret = -ENOSPC; 1473 ret = -ENOSPC;
1376 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1474 if (!cgroup_sane_behavior(cgrp) &&
1475 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
1377 goto out_unlock; 1476 goto out_unlock;
1378 1477
1379 cgroup_taskset_for_each(task, cgrp, tset) { 1478 cgroup_taskset_for_each(task, cgrp, tset) {
@@ -1422,8 +1521,7 @@ static cpumask_var_t cpus_attach;
1422 1521
1423static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1522static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1424{ 1523{
1425 /* static bufs protected by cpuset_mutex */ 1524 /* static buf protected by cpuset_mutex */
1426 static nodemask_t cpuset_attach_nodemask_from;
1427 static nodemask_t cpuset_attach_nodemask_to; 1525 static nodemask_t cpuset_attach_nodemask_to;
1428 struct mm_struct *mm; 1526 struct mm_struct *mm;
1429 struct task_struct *task; 1527 struct task_struct *task;
@@ -1431,6 +1529,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1431 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1529 struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
1432 struct cpuset *cs = cgroup_cs(cgrp); 1530 struct cpuset *cs = cgroup_cs(cgrp);
1433 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1531 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1532 struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
1533 struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
1434 1534
1435 mutex_lock(&cpuset_mutex); 1535 mutex_lock(&cpuset_mutex);
1436 1536
@@ -1438,9 +1538,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1438 if (cs == &top_cpuset) 1538 if (cs == &top_cpuset)
1439 cpumask_copy(cpus_attach, cpu_possible_mask); 1539 cpumask_copy(cpus_attach, cpu_possible_mask);
1440 else 1540 else
1441 guarantee_online_cpus(cs, cpus_attach); 1541 guarantee_online_cpus(cpus_cs, cpus_attach);
1442 1542
1443 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1543 guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
1444 1544
1445 cgroup_taskset_for_each(task, cgrp, tset) { 1545 cgroup_taskset_for_each(task, cgrp, tset) {
1446 /* 1546 /*
@@ -1457,26 +1557,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1457 * Change mm, possibly for multiple threads in a threadgroup. This is 1557 * Change mm, possibly for multiple threads in a threadgroup. This is
1458 * expensive and may sleep. 1558 * expensive and may sleep.
1459 */ 1559 */
1460 cpuset_attach_nodemask_from = oldcs->mems_allowed;
1461 cpuset_attach_nodemask_to = cs->mems_allowed; 1560 cpuset_attach_nodemask_to = cs->mems_allowed;
1462 mm = get_task_mm(leader); 1561 mm = get_task_mm(leader);
1463 if (mm) { 1562 if (mm) {
1563 struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
1564
1464 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1565 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
1465 if (is_memory_migrate(cs)) 1566
1466 cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, 1567 /*
1568 * old_mems_allowed is the same with mems_allowed here, except
1569 * if this task is being moved automatically due to hotplug.
1570 * In that case @mems_allowed has been updated and is empty,
1571 * so @old_mems_allowed is the right nodesets that we migrate
1572 * mm from.
1573 */
1574 if (is_memory_migrate(cs)) {
1575 cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
1467 &cpuset_attach_nodemask_to); 1576 &cpuset_attach_nodemask_to);
1577 }
1468 mmput(mm); 1578 mmput(mm);
1469 } 1579 }
1470 1580
1471 cs->attach_in_progress--; 1581 cs->old_mems_allowed = cpuset_attach_nodemask_to;
1472 1582
1473 /* 1583 cs->attach_in_progress--;
1474 * We may have raced with CPU/memory hotunplug. Trigger hotplug 1584 if (!cs->attach_in_progress)
1475 * propagation if @cs doesn't have any CPU or memory. It will move 1585 wake_up(&cpuset_attach_wq);
1476 * the newly added tasks to the nearest parent which can execute.
1477 */
1478 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1479 schedule_cpuset_propagate_hotplug(cs);
1480 1586
1481 mutex_unlock(&cpuset_mutex); 1587 mutex_unlock(&cpuset_mutex);
1482} 1588}
@@ -1588,13 +1694,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1588 * resources, wait for the previously scheduled operations before 1694 * resources, wait for the previously scheduled operations before
1589 * proceeding, so that we don't end up keep removing tasks added 1695 * proceeding, so that we don't end up keep removing tasks added
1590 * after execution capability is restored. 1696 * after execution capability is restored.
1591 *
1592 * Flushing cpuset_hotplug_work is enough to synchronize against
1593 * hotplug hanlding; however, cpuset_attach() may schedule
1594 * propagation work directly. Flush the workqueue too.
1595 */ 1697 */
1596 flush_work(&cpuset_hotplug_work); 1698 flush_work(&cpuset_hotplug_work);
1597 flush_workqueue(cpuset_propagate_hotplug_wq);
1598 1699
1599 mutex_lock(&cpuset_mutex); 1700 mutex_lock(&cpuset_mutex);
1600 if (!is_cpuset_online(cs)) 1701 if (!is_cpuset_online(cs))
@@ -1658,13 +1759,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1658 return count; 1759 return count;
1659} 1760}
1660 1761
1661static ssize_t cpuset_common_file_read(struct cgroup *cont, 1762static ssize_t cpuset_common_file_read(struct cgroup *cgrp,
1662 struct cftype *cft, 1763 struct cftype *cft,
1663 struct file *file, 1764 struct file *file,
1664 char __user *buf, 1765 char __user *buf,
1665 size_t nbytes, loff_t *ppos) 1766 size_t nbytes, loff_t *ppos)
1666{ 1767{
1667 struct cpuset *cs = cgroup_cs(cont); 1768 struct cpuset *cs = cgroup_cs(cgrp);
1668 cpuset_filetype_t type = cft->private; 1769 cpuset_filetype_t type = cft->private;
1669 char *page; 1770 char *page;
1670 ssize_t retval = 0; 1771 ssize_t retval = 0;
@@ -1694,9 +1795,9 @@ out:
1694 return retval; 1795 return retval;
1695} 1796}
1696 1797
1697static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) 1798static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
1698{ 1799{
1699 struct cpuset *cs = cgroup_cs(cont); 1800 struct cpuset *cs = cgroup_cs(cgrp);
1700 cpuset_filetype_t type = cft->private; 1801 cpuset_filetype_t type = cft->private;
1701 switch (type) { 1802 switch (type) {
1702 case FILE_CPU_EXCLUSIVE: 1803 case FILE_CPU_EXCLUSIVE:
@@ -1725,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1725 return 0; 1826 return 0;
1726} 1827}
1727 1828
1728static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1829static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)
1729{ 1830{
1730 struct cpuset *cs = cgroup_cs(cont); 1831 struct cpuset *cs = cgroup_cs(cgrp);
1731 cpuset_filetype_t type = cft->private; 1832 cpuset_filetype_t type = cft->private;
1732 switch (type) { 1833 switch (type) {
1733 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1834 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1839,14 +1940,14 @@ static struct cftype files[] = {
1839 1940
1840/* 1941/*
1841 * cpuset_css_alloc - allocate a cpuset css 1942 * cpuset_css_alloc - allocate a cpuset css
1842 * cont: control group that the new cpuset will be part of 1943 * cgrp: control group that the new cpuset will be part of
1843 */ 1944 */
1844 1945
1845static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) 1946static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
1846{ 1947{
1847 struct cpuset *cs; 1948 struct cpuset *cs;
1848 1949
1849 if (!cont->parent) 1950 if (!cgrp->parent)
1850 return &top_cpuset.css; 1951 return &top_cpuset.css;
1851 1952
1852 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1953 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1861,7 +1962,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
1861 cpumask_clear(cs->cpus_allowed); 1962 cpumask_clear(cs->cpus_allowed);
1862 nodes_clear(cs->mems_allowed); 1963 nodes_clear(cs->mems_allowed);
1863 fmeter_init(&cs->fmeter); 1964 fmeter_init(&cs->fmeter);
1864 INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
1865 cs->relax_domain_level = -1; 1965 cs->relax_domain_level = -1;
1866 1966
1867 return &cs->css; 1967 return &cs->css;
@@ -1942,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
1942 * will call rebuild_sched_domains_locked(). 2042 * will call rebuild_sched_domains_locked().
1943 */ 2043 */
1944 2044
1945static void cpuset_css_free(struct cgroup *cont) 2045static void cpuset_css_free(struct cgroup *cgrp)
1946{ 2046{
1947 struct cpuset *cs = cgroup_cs(cont); 2047 struct cpuset *cs = cgroup_cs(cgrp);
1948 2048
1949 free_cpumask_var(cs->cpus_allowed); 2049 free_cpumask_var(cs->cpus_allowed);
1950 kfree(cs); 2050 kfree(cs);
@@ -2024,41 +2124,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2024} 2124}
2025 2125
2026/** 2126/**
2027 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset 2127 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
2028 * @cs: cpuset in interest 2128 * @cs: cpuset in interest
2029 * 2129 *
2030 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone 2130 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
2031 * offline, update @cs accordingly. If @cs ends up with no CPU or memory, 2131 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
2032 * all its tasks are moved to the nearest ancestor with both resources. 2132 * all its tasks are moved to the nearest ancestor with both resources.
2033 */ 2133 */
2034static void cpuset_propagate_hotplug_workfn(struct work_struct *work) 2134static void cpuset_hotplug_update_tasks(struct cpuset *cs)
2035{ 2135{
2036 static cpumask_t off_cpus; 2136 static cpumask_t off_cpus;
2037 static nodemask_t off_mems, tmp_mems; 2137 static nodemask_t off_mems;
2038 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2039 bool is_empty; 2138 bool is_empty;
2139 bool sane = cgroup_sane_behavior(cs->css.cgroup);
2140
2141retry:
2142 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2040 2143
2041 mutex_lock(&cpuset_mutex); 2144 mutex_lock(&cpuset_mutex);
2042 2145
2146 /*
2147 * We have raced with task attaching. We wait until attaching
2148 * is finished, so we won't attach a task to an empty cpuset.
2149 */
2150 if (cs->attach_in_progress) {
2151 mutex_unlock(&cpuset_mutex);
2152 goto retry;
2153 }
2154
2043 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2155 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2044 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2156 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
2045 2157
2046 /* remove offline cpus from @cs */ 2158 mutex_lock(&callback_mutex);
2047 if (!cpumask_empty(&off_cpus)) { 2159 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
2048 mutex_lock(&callback_mutex); 2160 mutex_unlock(&callback_mutex);
2049 cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); 2161
2050 mutex_unlock(&callback_mutex); 2162 /*
2163 * If sane_behavior flag is set, we need to update tasks' cpumask
2164 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
2165 * call update_tasks_cpumask() if the cpuset becomes empty, as
2166 * the tasks in it will be migrated to an ancestor.
2167 */
2168 if ((sane && cpumask_empty(cs->cpus_allowed)) ||
2169 (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
2051 update_tasks_cpumask(cs, NULL); 2170 update_tasks_cpumask(cs, NULL);
2052 }
2053 2171
2054 /* remove offline mems from @cs */ 2172 mutex_lock(&callback_mutex);
2055 if (!nodes_empty(off_mems)) { 2173 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
2056 tmp_mems = cs->mems_allowed; 2174 mutex_unlock(&callback_mutex);
2057 mutex_lock(&callback_mutex); 2175
2058 nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); 2176 /*
2059 mutex_unlock(&callback_mutex); 2177 * If sane_behavior flag is set, we need to update tasks' nodemask
2060 update_tasks_nodemask(cs, &tmp_mems, NULL); 2178 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
2061 } 2179 * call update_tasks_nodemask() if the cpuset becomes empty, as
2180 * the tasks in it will be migrated to an ancestor.
2181 */
2182 if ((sane && nodes_empty(cs->mems_allowed)) ||
2183 (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
2184 update_tasks_nodemask(cs, NULL);
2062 2185
2063 is_empty = cpumask_empty(cs->cpus_allowed) || 2186 is_empty = cpumask_empty(cs->cpus_allowed) ||
2064 nodes_empty(cs->mems_allowed); 2187 nodes_empty(cs->mems_allowed);
@@ -2066,40 +2189,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2066 mutex_unlock(&cpuset_mutex); 2189 mutex_unlock(&cpuset_mutex);
2067 2190
2068 /* 2191 /*
2069 * If @cs became empty, move tasks to the nearest ancestor with 2192 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
2070 * execution resources. This is full cgroup operation which will 2193 *
2194 * Otherwise move tasks to the nearest ancestor with execution
2195 * resources. This is full cgroup operation which will
2071 * also call back into cpuset. Should be done outside any lock. 2196 * also call back into cpuset. Should be done outside any lock.
2072 */ 2197 */
2073 if (is_empty) 2198 if (!sane && is_empty)
2074 remove_tasks_in_empty_cpuset(cs); 2199 remove_tasks_in_empty_cpuset(cs);
2075
2076 /* the following may free @cs, should be the last operation */
2077 css_put(&cs->css);
2078}
2079
2080/**
2081 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
2082 * @cs: cpuset of interest
2083 *
2084 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
2085 * memory masks according to top_cpuset.
2086 */
2087static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2088{
2089 /*
2090 * Pin @cs. The refcnt will be released when the work item
2091 * finishes executing.
2092 */
2093 if (!css_tryget(&cs->css))
2094 return;
2095
2096 /*
2097 * Queue @cs->hotplug_work. If already pending, lose the css ref.
2098 * cpuset_propagate_hotplug_wq is ordered and propagation will
2099 * happen in the order this function is called.
2100 */
2101 if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
2102 css_put(&cs->css);
2103} 2200}
2104 2201
2105/** 2202/**
@@ -2112,18 +2209,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
2112 * actively using CPU hotplug but making no active use of cpusets. 2209 * actively using CPU hotplug but making no active use of cpusets.
2113 * 2210 *
2114 * Non-root cpusets are only affected by offlining. If any CPUs or memory 2211 * Non-root cpusets are only affected by offlining. If any CPUs or memory
2115 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all 2212 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
2116 * descendants. 2213 * all descendants.
2117 * 2214 *
2118 * Note that CPU offlining during suspend is ignored. We don't modify 2215 * Note that CPU offlining during suspend is ignored. We don't modify
2119 * cpusets across suspend/resume cycles at all. 2216 * cpusets across suspend/resume cycles at all.
2120 */ 2217 */
2121static void cpuset_hotplug_workfn(struct work_struct *work) 2218static void cpuset_hotplug_workfn(struct work_struct *work)
2122{ 2219{
2123 static cpumask_t new_cpus, tmp_cpus; 2220 static cpumask_t new_cpus;
2124 static nodemask_t new_mems, tmp_mems; 2221 static nodemask_t new_mems;
2125 bool cpus_updated, mems_updated; 2222 bool cpus_updated, mems_updated;
2126 bool cpus_offlined, mems_offlined;
2127 2223
2128 mutex_lock(&cpuset_mutex); 2224 mutex_lock(&cpuset_mutex);
2129 2225
@@ -2132,12 +2228,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2132 new_mems = node_states[N_MEMORY]; 2228 new_mems = node_states[N_MEMORY];
2133 2229
2134 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); 2230 cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
2135 cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
2136 &new_cpus);
2137
2138 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); 2231 mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
2139 nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
2140 mems_offlined = !nodes_empty(tmp_mems);
2141 2232
2142 /* synchronize cpus_allowed to cpu_active_mask */ 2233 /* synchronize cpus_allowed to cpu_active_mask */
2143 if (cpus_updated) { 2234 if (cpus_updated) {
@@ -2149,28 +2240,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2149 2240
2150 /* synchronize mems_allowed to N_MEMORY */ 2241 /* synchronize mems_allowed to N_MEMORY */
2151 if (mems_updated) { 2242 if (mems_updated) {
2152 tmp_mems = top_cpuset.mems_allowed;
2153 mutex_lock(&callback_mutex); 2243 mutex_lock(&callback_mutex);
2154 top_cpuset.mems_allowed = new_mems; 2244 top_cpuset.mems_allowed = new_mems;
2155 mutex_unlock(&callback_mutex); 2245 mutex_unlock(&callback_mutex);
2156 update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); 2246 update_tasks_nodemask(&top_cpuset, NULL);
2157 } 2247 }
2158 2248
2159 /* if cpus or mems went down, we need to propagate to descendants */ 2249 mutex_unlock(&cpuset_mutex);
2160 if (cpus_offlined || mems_offlined) { 2250
2251 /* if cpus or mems changed, we need to propagate to descendants */
2252 if (cpus_updated || mems_updated) {
2161 struct cpuset *cs; 2253 struct cpuset *cs;
2162 struct cgroup *pos_cgrp; 2254 struct cgroup *pos_cgrp;
2163 2255
2164 rcu_read_lock(); 2256 rcu_read_lock();
2165 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) 2257 cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) {
2166 schedule_cpuset_propagate_hotplug(cs); 2258 if (!css_tryget(&cs->css))
2167 rcu_read_unlock(); 2259 continue;
2168 } 2260 rcu_read_unlock();
2169 2261
2170 mutex_unlock(&cpuset_mutex); 2262 cpuset_hotplug_update_tasks(cs);
2171 2263
2172 /* wait for propagations to finish */ 2264 rcu_read_lock();
2173 flush_workqueue(cpuset_propagate_hotplug_wq); 2265 css_put(&cs->css);
2266 }
2267 rcu_read_unlock();
2268 }
2174 2269
2175 /* rebuild sched domains if cpus_allowed has changed */ 2270 /* rebuild sched domains if cpus_allowed has changed */
2176 if (cpus_updated) 2271 if (cpus_updated)
@@ -2219,12 +2314,9 @@ void __init cpuset_init_smp(void)
2219{ 2314{
2220 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); 2315 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2221 top_cpuset.mems_allowed = node_states[N_MEMORY]; 2316 top_cpuset.mems_allowed = node_states[N_MEMORY];
2317 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
2222 2318
2223 register_hotmemory_notifier(&cpuset_track_online_nodes_nb); 2319 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
2224
2225 cpuset_propagate_hotplug_wq =
2226 alloc_ordered_workqueue("cpuset_hotplug", 0);
2227 BUG_ON(!cpuset_propagate_hotplug_wq);
2228} 2320}
2229 2321
2230/** 2322/**
@@ -2240,21 +2332,23 @@ void __init cpuset_init_smp(void)
2240 2332
2241void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2333void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2242{ 2334{
2335 struct cpuset *cpus_cs;
2336
2243 mutex_lock(&callback_mutex); 2337 mutex_lock(&callback_mutex);
2244 task_lock(tsk); 2338 task_lock(tsk);
2245 guarantee_online_cpus(task_cs(tsk), pmask); 2339 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2340 guarantee_online_cpus(cpus_cs, pmask);
2246 task_unlock(tsk); 2341 task_unlock(tsk);
2247 mutex_unlock(&callback_mutex); 2342 mutex_unlock(&callback_mutex);
2248} 2343}
2249 2344
2250void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2345void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2251{ 2346{
2252 const struct cpuset *cs; 2347 const struct cpuset *cpus_cs;
2253 2348
2254 rcu_read_lock(); 2349 rcu_read_lock();
2255 cs = task_cs(tsk); 2350 cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
2256 if (cs) 2351 do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
2257 do_set_cpus_allowed(tsk, cs->cpus_allowed);
2258 rcu_read_unlock(); 2352 rcu_read_unlock();
2259 2353
2260 /* 2354 /*
@@ -2293,11 +2387,13 @@ void cpuset_init_current_mems_allowed(void)
2293 2387
2294nodemask_t cpuset_mems_allowed(struct task_struct *tsk) 2388nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
2295{ 2389{
2390 struct cpuset *mems_cs;
2296 nodemask_t mask; 2391 nodemask_t mask;
2297 2392
2298 mutex_lock(&callback_mutex); 2393 mutex_lock(&callback_mutex);
2299 task_lock(tsk); 2394 task_lock(tsk);
2300 guarantee_online_mems(task_cs(tsk), &mask); 2395 mems_cs = effective_nodemask_cpuset(task_cs(tsk));
2396 guarantee_online_mems(mems_cs, &mask);
2301 task_unlock(tsk); 2397 task_unlock(tsk);
2302 mutex_unlock(&callback_mutex); 2398 mutex_unlock(&callback_mutex);
2303 2399