Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c | 500
1 file changed, 301 insertions(+), 199 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe5..ea1966db34f2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -59,6 +59,7 @@
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
+#include <linux/wait.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -87,6 +88,18 @@ struct cpuset {
 	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
+	/*
+	 * This is old Memory Nodes tasks took on.
+	 *
+	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
+	 * - A new cpuset's old_mems_allowed is initialized when some
+	 *   task is moved into it.
+	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
+	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
+	 *   then old_mems_allowed is updated to mems_allowed.
+	 */
+	nodemask_t old_mems_allowed;
+
 	struct fmeter fmeter;		/* memory_pressure filter */
 
 	/*
@@ -100,14 +113,12 @@ struct cpuset {
 
 	/* for custom sched domain */
 	int relax_domain_level;
-
-	struct work_struct hotplug_work;
 };
 
 /* Retrieve the cpuset for a cgroup */
-static inline struct cpuset *cgroup_cs(struct cgroup *cont)
+static inline struct cpuset *cgroup_cs(struct cgroup *cgrp)
 {
-	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
+	return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id),
 			    struct cpuset, css);
 }
 
@@ -267,14 +278,11 @@ static DEFINE_MUTEX(callback_mutex);
 /*
  * CPU / memory hotplug is handled asynchronously.
  */
-static struct workqueue_struct *cpuset_propagate_hotplug_wq;
-
 static void cpuset_hotplug_workfn(struct work_struct *work);
-static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
-static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
-
 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 
+static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+
 /*
  * This is ugly, but preserves the userspace API for existing cpuset
  * users. If someone tries to mount the "cpuset" filesystem, we
@@ -304,53 +312,38 @@ static struct file_system_type cpuset_fs_type = {
 /*
  * Return in pmask the portion of a cpusets's cpus_allowed that
  * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus. If we get
- * all the way to the top and still haven't found any online cpus,
- * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_mask.
+ * until we find one that does have some online cpus. The top
+ * cpuset always has some cpus online.
  *
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
  *
  * Call with callback_mutex held.
  */
-
 static void guarantee_online_cpus(const struct cpuset *cs,
 				  struct cpumask *pmask)
 {
-	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+	while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
 		cs = parent_cs(cs);
-	if (cs)
-		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
-	else
-		cpumask_copy(pmask, cpu_online_mask);
-	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
+	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
 }
 
 /*
  * Return in *pmask the portion of a cpusets's mems_allowed that
  * are online, with memory. If none are online with memory, walk
  * up the cpuset hierarchy until we find one that does have some
- * online mems. If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_MEMORY].
+ * online mems. The top cpuset always has some mems online.
  *
  * One way or another, we guarantee to return some non-empty subset
  * of node_states[N_MEMORY].
  *
  * Call with callback_mutex held.
  */
-
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 {
-	while (cs && !nodes_intersects(cs->mems_allowed,
-				       node_states[N_MEMORY]))
+	while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
 		cs = parent_cs(cs);
-	if (cs)
-		nodes_and(*pmask, cs->mems_allowed,
-			  node_states[N_MEMORY]);
-	else
-		*pmask = node_states[N_MEMORY];
-	BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
+	nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
 }
 
 /*
@@ -440,7 +433,7 @@ static void free_trial_cpuset(struct cpuset *trial)
 
 static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 {
-	struct cgroup *cont;
+	struct cgroup *cgrp;
 	struct cpuset *c, *par;
 	int ret;
 
@@ -448,7 +441,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 
 	/* Each of our child cpusets must be a subset of us */
 	ret = -EBUSY;
-	cpuset_for_each_child(c, cont, cur)
+	cpuset_for_each_child(c, cgrp, cur)
 		if (!is_cpuset_subset(c, trial))
 			goto out;
 
@@ -469,7 +462,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	 * overlap
 	 */
 	ret = -EINVAL;
-	cpuset_for_each_child(c, cont, par) {
+	cpuset_for_each_child(c, cgrp, par) {
 		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 		    c != cur &&
 		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
@@ -482,13 +475,17 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 
 	/*
 	 * Cpusets with tasks - existing or newly being attached - can't
-	 * have empty cpus_allowed or mems_allowed.
+	 * be changed to have empty cpus_allowed or mems_allowed.
 	 */
 	ret = -ENOSPC;
-	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
-	    (cpumask_empty(trial->cpus_allowed) ||
-	     nodes_empty(trial->mems_allowed)))
-		goto out;
+	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) {
+		if (!cpumask_empty(cur->cpus_allowed) &&
+		    cpumask_empty(trial->cpus_allowed))
+			goto out;
+		if (!nodes_empty(cur->mems_allowed) &&
+		    nodes_empty(trial->mems_allowed))
+			goto out;
+	}
 
 	ret = 0;
 out:
@@ -540,7 +537,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  * This function builds a partial partition of the systems CPUs
  * A 'partial partition' is a set of non-overlapping subsets whose
  * union is a subset of that set.
- * The output of this function needs to be passed to kernel/sched.c
+ * The output of this function needs to be passed to kernel/sched/core.c
  * partition_sched_domains() routine, which will rebuild the scheduler's
  * load balancing domains (sched domains) as specified by that partial
  * partition.
@@ -569,7 +566,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  *	   is a subset of one of these domains, while there are as
  *	   many such domains as possible, each as small as possible.
  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
- *	   the kernel/sched.c routine partition_sched_domains() in a
+ *	   the kernel/sched/core.c routine partition_sched_domains() in a
  *	   convenient format, that can be easily compared to the prior
  *	   value to determine what partition elements (sched domains)
  *	   were changed (added or removed.)
@@ -798,21 +795,43 @@ void rebuild_sched_domains(void)
 	mutex_unlock(&cpuset_mutex);
 }
 
-/**
- * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
- * @tsk: task to test
- * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
+/*
+ * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
+ * @cs: the cpuset in interest
  *
- * Call with cpuset_mutex held. May take callback_mutex during call.
- * Called for each task in a cgroup by cgroup_scan_tasks().
- * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
- * words, if its mask is not equal to its cpuset's mask).
+ * A cpuset's effective cpumask is the cpumask of the nearest ancestor
+ * with non-empty cpus. We use effective cpumask whenever:
+ * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
+ *   if the cpuset they reside in has no cpus)
+ * - we want to retrieve task_cs(tsk)'s cpus_allowed.
+ *
+ * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
+ * exception. See comments there.
  */
-static int cpuset_test_cpumask(struct task_struct *tsk,
-			       struct cgroup_scanner *scan)
+static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
 {
-	return !cpumask_equal(&tsk->cpus_allowed,
-			      (cgroup_cs(scan->cg))->cpus_allowed);
+	while (cpumask_empty(cs->cpus_allowed))
+		cs = parent_cs(cs);
+	return cs;
+}
+
+/*
+ * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
+ * @cs: the cpuset in interest
+ *
+ * A cpuset's effective nodemask is the nodemask of the nearest ancestor
+ * with non-empty mems. We use effective nodemask whenever:
+ * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
+ *   if the cpuset they reside in has no mems)
+ * - we want to retrieve task_cs(tsk)'s mems_allowed.
+ *
+ * Called with cpuset_mutex held.
+ */
+static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
+{
+	while (nodes_empty(cs->mems_allowed))
+		cs = parent_cs(cs);
+	return cs;
 }
 
 /**
@@ -829,7 +848,10 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
 static void cpuset_change_cpumask(struct task_struct *tsk,
 				  struct cgroup_scanner *scan)
 {
-	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
+	struct cpuset *cpus_cs;
+
+	cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg));
+	set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed);
 }
 
 /**
@@ -850,12 +872,51 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 	struct cgroup_scanner scan;
 
 	scan.cg = cs->css.cgroup;
-	scan.test_task = cpuset_test_cpumask;
+	scan.test_task = NULL;
 	scan.process_task = cpuset_change_cpumask;
 	scan.heap = heap;
 	cgroup_scan_tasks(&scan);
 }
 
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on cpumask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_cpumask_hier(struct cpuset *root_cs,
+				      bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_cpumask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp has some CPU */
+		if (!cpumask_empty(cp->cpus_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+		if (!css_tryget(&cp->css))
+			continue;
+		rcu_read_unlock();
+
+		update_tasks_cpumask(cp, heap);
+
+		rcu_read_lock();
+		css_put(&cp->css);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -888,14 +949,15 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
 			return -EINVAL;
 	}
-	retval = validate_change(cs, trialcs);
-	if (retval < 0)
-		return retval;
 
 	/* Nothing to do if the cpus didn't change */
 	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
 		return 0;
 
+	retval = validate_change(cs, trialcs);
+	if (retval < 0)
+		return retval;
+
 	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
 	if (retval)
 		return retval;
@@ -906,11 +968,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
-	/*
-	 * Scan tasks in the cpuset, and update the cpumasks of any
-	 * that need an update.
-	 */
-	update_tasks_cpumask(cs, &heap);
+	update_tasks_cpumask_hier(cs, true, &heap);
 
 	heap_free(&heap);
 
@@ -943,12 +1001,14 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 			      const nodemask_t *to)
 {
 	struct task_struct *tsk = current;
+	struct cpuset *mems_cs;
 
 	tsk->mems_allowed = *to;
 
 	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
 
-	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
+	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+	guarantee_online_mems(mems_cs, &tsk->mems_allowed);
 }
 
 /*
@@ -1007,16 +1067,12 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 static void cpuset_change_nodemask(struct task_struct *p,
 				   struct cgroup_scanner *scan)
 {
+	struct cpuset *cs = cgroup_cs(scan->cg);
 	struct mm_struct *mm;
-	struct cpuset *cs;
 	int migrate;
-	const nodemask_t *oldmem = scan->data;
-	static nodemask_t newmems;	/* protected by cpuset_mutex */
+	nodemask_t *newmems = scan->data;
 
-	cs = cgroup_cs(scan->cg);
-	guarantee_online_mems(cs, &newmems);
-
-	cpuset_change_task_nodemask(p, &newmems);
+	cpuset_change_task_nodemask(p, newmems);
 
 	mm = get_task_mm(p);
 	if (!mm)
@@ -1026,7 +1082,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
 
 	mpol_rebind_mm(mm, &cs->mems_allowed);
 	if (migrate)
-		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+		cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems);
 	mmput(mm);
 }
 
@@ -1035,25 +1091,27 @@ static void *cpuset_being_rebound;
 /**
  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
- * @oldmem: old mems_allowed of cpuset cs
  * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
  * Called with cpuset_mutex held
  * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
  * if @heap != NULL.
  */
-static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
-				  struct ptr_heap *heap)
+static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
 {
+	static nodemask_t newmems;	/* protected by cpuset_mutex */
 	struct cgroup_scanner scan;
+	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
 
 	cpuset_being_rebound = cs;	/* causes mpol_dup() rebind */
 
+	guarantee_online_mems(mems_cs, &newmems);
+
 	scan.cg = cs->css.cgroup;
 	scan.test_task = NULL;
 	scan.process_task = cpuset_change_nodemask;
 	scan.heap = heap;
-	scan.data = (nodemask_t *)oldmem;
+	scan.data = &newmems;
 
 	/*
 	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1067,11 +1125,56 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 	 */
 	cgroup_scan_tasks(&scan);
 
+	/*
+	 * All the tasks' nodemasks have been updated, update
+	 * cs->old_mems_allowed.
+	 */
+	cs->old_mems_allowed = newmems;
+
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
 	cpuset_being_rebound = NULL;
 }
 
 /*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs,
+				       bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_nodemask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp has some mem */
+		if (!nodes_empty(cp->mems_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+		if (!css_tryget(&cp->css))
+			continue;
+		rcu_read_unlock();
+
+		update_tasks_nodemask(cp, heap);
+
+		rcu_read_lock();
+		css_put(&cp->css);
+	}
+	rcu_read_unlock();
+}
+
+/*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset. Needs to validate the request, update the
  * cpusets mems_allowed, and for each task in the cpuset,
@@ -1087,13 +1190,9 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 			   const char *buf)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
 	int retval;
 	struct ptr_heap heap;
 
-	if (!oldmem)
-		return -ENOMEM;
-
 	/*
 	 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
 	 * it's read-only
@@ -1122,8 +1221,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 			goto done;
 		}
 	}
-	*oldmem = cs->mems_allowed;
-	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
+
+	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
 		retval = 0;		/* Too easy - nothing to do */
 		goto done;
 	}
@@ -1139,11 +1238,10 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	cs->mems_allowed = trialcs->mems_allowed;
 	mutex_unlock(&callback_mutex);
 
-	update_tasks_nodemask(cs, oldmem, &heap);
+	update_tasks_nodemask_hier(cs, true, &heap);
 
 	heap_free(&heap);
 done:
-	NODEMASK_FREE(oldmem);
 	return retval;
 }
 
@@ -1372,8 +1470,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 
 	mutex_lock(&cpuset_mutex);
 
+	/*
+	 * We allow moving tasks into an empty cpuset if the sane_behavior
+	 * flag is set.
+	 */
 	ret = -ENOSPC;
-	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+	if (!cgroup_sane_behavior(cgrp) &&
+	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
 		goto out_unlock;
 
 	cgroup_taskset_for_each(task, cgrp, tset) {
@@ -1422,8 +1525,7 @@ static cpumask_var_t cpus_attach;
 
 static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
-	/* static bufs protected by cpuset_mutex */
-	static nodemask_t cpuset_attach_nodemask_from;
+	/* static buf protected by cpuset_mutex */
 	static nodemask_t cpuset_attach_nodemask_to;
 	struct mm_struct *mm;
 	struct task_struct *task;
@@ -1431,6 +1533,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct cpuset *oldcs = cgroup_cs(oldcgrp);
+	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
+	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
 
 	mutex_lock(&cpuset_mutex);
 
@@ -1438,9 +1542,9 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	if (cs == &top_cpuset)
 		cpumask_copy(cpus_attach, cpu_possible_mask);
 	else
-		guarantee_online_cpus(cs, cpus_attach);
+		guarantee_online_cpus(cpus_cs, cpus_attach);
 
-	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+	guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
 
 	cgroup_taskset_for_each(task, cgrp, tset) {
 		/*
@@ -1457,26 +1561,32 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	 * Change mm, possibly for multiple threads in a threadgroup. This is
 	 * expensive and may sleep.
 	 */
-	cpuset_attach_nodemask_from = oldcs->mems_allowed;
 	cpuset_attach_nodemask_to = cs->mems_allowed;
 	mm = get_task_mm(leader);
 	if (mm) {
+		struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
+
 		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
-		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+
+		/*
+		 * old_mems_allowed is the same as mems_allowed here, except
+		 * if this task is being moved automatically due to hotplug.
+		 * In that case @mems_allowed has been updated and is empty,
+		 * so @old_mems_allowed is the right nodemask that we migrate
+		 * mm from.
+		 */
+		if (is_memory_migrate(cs)) {
+			cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
 					  &cpuset_attach_nodemask_to);
+		}
 		mmput(mm);
 	}
 
-	cs->attach_in_progress--;
+	cs->old_mems_allowed = cpuset_attach_nodemask_to;
 
-	/*
-	 * We may have raced with CPU/memory hotunplug. Trigger hotplug
-	 * propagation if @cs doesn't have any CPU or memory. It will move
-	 * the newly added tasks to the nearest parent which can execute.
-	 */
-	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
-		schedule_cpuset_propagate_hotplug(cs);
+	cs->attach_in_progress--;
+	if (!cs->attach_in_progress)
+		wake_up(&cpuset_attach_wq);
 
 	mutex_unlock(&cpuset_mutex);
 }
@@ -1502,11 +1612,13 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
-	int retval = -ENODEV;
+	int retval = 0;
 
 	mutex_lock(&cpuset_mutex);
-	if (!is_cpuset_online(cs))
+	if (!is_cpuset_online(cs)) {
+		retval = -ENODEV;
 		goto out_unlock;
+	}
 
 	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
@@ -1588,13 +1700,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 	 * resources, wait for the previously scheduled operations before
 	 * proceeding, so that we don't end up keep removing tasks added
 	 * after execution capability is restored.
-	 *
-	 * Flushing cpuset_hotplug_work is enough to synchronize against
-	 * hotplug hanlding; however, cpuset_attach() may schedule
-	 * propagation work directly. Flush the workqueue too.
 	 */
 	flush_work(&cpuset_hotplug_work);
-	flush_workqueue(cpuset_propagate_hotplug_wq);
 
 	mutex_lock(&cpuset_mutex);
 	if (!is_cpuset_online(cs))
@@ -1658,13 +1765,13 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 	return count;
 }
 
-static ssize_t cpuset_common_file_read(struct cgroup *cont,
+static ssize_t cpuset_common_file_read(struct cgroup *cgrp,
 				       struct cftype *cft,
 				       struct file *file,
 				       char __user *buf,
 				       size_t nbytes, loff_t *ppos)
 {
-	struct cpuset *cs = cgroup_cs(cont);
+	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 	char *page;
 	ssize_t retval = 0;
@@ -1694,9 +1801,9 @@ out:
 	return retval;
 }
 
-static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
+static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft)
 {
-	struct cpuset *cs = cgroup_cs(cont);
+	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
@@ -1725,9 +1832,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
 	return 0;
 }
 
-static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
+static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft)
 {
-	struct cpuset *cs = cgroup_cs(cont);
+	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 	switch (type) {
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1839,14 +1946,14 @@ static struct cftype files[] = {
 
 /*
  *	cpuset_css_alloc - allocate a cpuset css
- *	cont:	control group that the new cpuset will be part of
+ *	cgrp:	control group that the new cpuset will be part of
  */
 
-static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
+static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp)
 {
 	struct cpuset *cs;
 
-	if (!cont->parent)
+	if (!cgrp->parent)
 		return &top_cpuset.css;
 
 	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
@@ -1861,7 +1968,6 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
 	fmeter_init(&cs->fmeter);
-	INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
 	cs->relax_domain_level = -1;
 
 	return &cs->css;
@@ -1942,9 +2048,9 @@ static void cpuset_css_offline(struct cgroup *cgrp)
  * will call rebuild_sched_domains_locked().
  */
 
-static void cpuset_css_free(struct cgroup *cont)
+static void cpuset_css_free(struct cgroup *cgrp)
 {
-	struct cpuset *cs = cgroup_cs(cont);
+	struct cpuset *cs = cgroup_cs(cgrp);
 
 	free_cpumask_var(cs->cpus_allowed);
 	kfree(cs);
@@ -2024,41 +2130,64 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 }
 
 /**
- * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
+ * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
  *
  * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
  * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
  * all its tasks are moved to the nearest ancestor with both resources.
  */
-static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
 	static cpumask_t off_cpus;
-	static nodemask_t off_mems, tmp_mems;
-	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
+	static nodemask_t off_mems;
 	bool is_empty;
+	bool sane = cgroup_sane_behavior(cs->css.cgroup);
+
+retry:
+	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
 
 	mutex_lock(&cpuset_mutex);
 
+	/*
+	 * We have raced with task attaching. We wait until attaching
+	 * is finished, so we won't attach a task to an empty cpuset.
+	 */
+	if (cs->attach_in_progress) {
+		mutex_unlock(&cpuset_mutex);
+		goto retry;
+	}
+
 	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
 	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
 
-	/* remove offline cpus from @cs */
-	if (!cpumask_empty(&off_cpus)) {
-		mutex_lock(&callback_mutex);
-		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-		mutex_unlock(&callback_mutex);
+	mutex_lock(&callback_mutex);
+	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+	mutex_unlock(&callback_mutex);
+
+	/*
+	 * If sane_behavior flag is set, we need to update tasks' cpumask
+	 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
+	 * call update_tasks_cpumask() if the cpuset becomes empty, as
+	 * the tasks in it will be migrated to an ancestor.
+	 */
+	if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
 		update_tasks_cpumask(cs, NULL);
-	}
 
-	/* remove offline mems from @cs */
-	if (!nodes_empty(off_mems)) {
-		tmp_mems = cs->mems_allowed;
-		mutex_lock(&callback_mutex);
-		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-		mutex_unlock(&callback_mutex);
-		update_tasks_nodemask(cs, &tmp_mems, NULL);
-	}
+	mutex_lock(&callback_mutex);
+	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+	mutex_unlock(&callback_mutex);
+
+	/*
+	 * If sane_behavior flag is set, we need to update tasks' nodemask
+	 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
+	 * call update_tasks_nodemask() if the cpuset becomes empty, as
+	 * the tasks in it will be migrated to an ancestor.
+	 */
+	if ((sane && nodes_empty(cs->mems_allowed)) ||
+	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
+		update_tasks_nodemask(cs, NULL);
 
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		nodes_empty(cs->mems_allowed);
@@ -2066,40 +2195,14 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
 	mutex_unlock(&cpuset_mutex);
 
 	/*
-	 * If @cs became empty, move tasks to the nearest ancestor with
-	 * execution resources. This is full cgroup operation which will
+	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
+	 *
+	 * Otherwise move tasks to the nearest ancestor with execution
+	 * resources. This is full cgroup operation which will
 	 * also call back into cpuset. Should be done outside any lock.
 	 */
-	if (is_empty)
+	if (!sane && is_empty)
 		remove_tasks_in_empty_cpuset(cs);
-
-	/* the following may free @cs, should be the last operation */
-	css_put(&cs->css);
-}
-
-/**
- * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
- * @cs: cpuset of interest
- *
- * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
- * memory masks according to top_cpuset.
- */
-static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
-{
-	/*
-	 * Pin @cs. The refcnt will be released when the work item
-	 * finishes executing.
-	 */
-	if (!css_tryget(&cs->css))
-		return;
-
-	/*
-	 * Queue @cs->hotplug_work. If already pending, lose the css ref.
-	 * cpuset_propagate_hotplug_wq is ordered and propagation will
-	 * happen in the order this function is called.
-	 */
-	if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
-		css_put(&cs->css);
 }
 
 /**
@@ -2112,18 +2215,17 @@ static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
  * actively using CPU hotplug but making no active use of cpusets.
  *
  * Non-root cpusets are only affected by offlining. If any CPUs or memory
- * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
- * descendants.
+ * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
+ * all descendants.
  *
  * Note that CPU offlining during suspend is ignored. We don't modify
  * cpusets across suspend/resume cycles at all.
  */
 static void cpuset_hotplug_workfn(struct work_struct *work)
 {
-	static cpumask_t new_cpus, tmp_cpus;
-	static nodemask_t new_mems, tmp_mems;
+	static cpumask_t new_cpus;
+	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
-	bool cpus_offlined, mems_offlined;
 
 	mutex_lock(&cpuset_mutex);
 
@@ -2132,12 +2234,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	new_mems = node_states[N_MEMORY];
 
 	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
-				       &new_cpus);
-
 	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
-	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
-	mems_offlined = !nodes_empty(tmp_mems);
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
@@ -2149,28 +2246,32 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
-		tmp_mems = top_cpuset.mems_allowed;
 		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = new_mems;
 		mutex_unlock(&callback_mutex);
-		update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+		update_tasks_nodemask(&top_cpuset, NULL);
 	}
 
-	/* if cpus or mems went down, we need to propagate to descendants */
-	if (cpus_offlined || mems_offlined) {
+	mutex_unlock(&cpuset_mutex);
+
+	/* if cpus or mems changed, we need to propagate to descendants */
+	if (cpus_updated || mems_updated) {
 		struct cpuset *cs;
 		struct cgroup *pos_cgrp;
 
 		rcu_read_lock();
-		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
-			schedule_cpuset_propagate_hotplug(cs);
-		rcu_read_unlock();
-	}
+		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) {
+			if (!css_tryget(&cs->css))
+				continue;
+			rcu_read_unlock();
 
-	mutex_unlock(&cpuset_mutex);
-
-	/* wait for propagations to finish */
-	flush_workqueue(cpuset_propagate_hotplug_wq);
+			cpuset_hotplug_update_tasks(cs);
+
+			rcu_read_lock();
+			css_put(&cs->css);
+		}
+		rcu_read_unlock();
+	}
 
 	/* rebuild sched domains if cpus_allowed has changed */
 	if (cpus_updated)
@@ -2219,12 +2320,9 @@ void __init cpuset_init_smp(void)
 {
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_MEMORY];
+	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
 
 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
-
-	cpuset_propagate_hotplug_wq =
-		alloc_ordered_workqueue("cpuset_hotplug", 0);
-	BUG_ON(!cpuset_propagate_hotplug_wq);
 }
 
 /**
@@ -2240,21 +2338,23 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
+	struct cpuset *cpus_cs;
+
 	mutex_lock(&callback_mutex);
 	task_lock(tsk);
-	guarantee_online_cpus(task_cs(tsk), pmask);
+	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+	guarantee_online_cpus(cpus_cs, pmask);
 	task_unlock(tsk);
 	mutex_unlock(&callback_mutex);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-	const struct cpuset *cs;
+	const struct cpuset *cpus_cs;
 
 	rcu_read_lock();
-	cs = task_cs(tsk);
-	if (cs)
-		do_set_cpus_allowed(tsk, cs->cpus_allowed);
+	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
+	do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
 	rcu_read_unlock();
 
 	/*
@@ -2293,11 +2393,13 @@ void cpuset_init_current_mems_allowed(void)
 
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
+	struct cpuset *mems_cs;
 	nodemask_t mask;
 
 	mutex_lock(&callback_mutex);
 	task_lock(tsk);
-	guarantee_online_mems(task_cs(tsk), &mask);
+	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
+	guarantee_online_mems(mems_cs, &mask);
 	task_unlock(tsk);
 	mutex_unlock(&callback_mutex);
 