diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-20 12:18:31 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-20 12:18:31 -0500 |
commit | 9ae46e6702d98d22037368896298d05958ad5737 (patch) | |
tree | 019ce8ccff0a88fc7f5ebaf5c052daac5bac3860 | |
parent | 502b24c23b44fbaa01cc2cbd86d8035845b7811f (diff) | |
parent | d127027baf98dce3ca31bec18c2c0e048ceda7c4 (diff) |
Merge branch 'for-3.9-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cpuset changes from Tejun Heo:
- Synchronization has seen a lot of changes with focus on decoupling
cpuset synchronization from cgroup internal locking.
After this change, there only remain a couple of mostly trivial
dependencies on cgroup_lock outside cgroup core proper. cgroup_lock
is scheduled to be unexported in this devel cycle.
This will finally remove the fragile locking order around cgroup
(cgroup locking wants to / should be one of the outermost but yet has
been acquired from deep inside individual controllers).
- At this point, Li is the most knowledgeable about cpuset and is taking
over the maintainership of cpuset.
* 'for-3.9-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cpuset: drop spurious retval assignment in proc_cpuset_show()
cpuset: fix RCU lockdep splat
cpuset: update MAINTAINERS
cpuset: remove cpuset->parent
cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()
cpuset: replace cgroup_mutex locking with cpuset internal locking
cpuset: schedule hotplug propagation from cpuset_attach() if the cpuset is empty
cpuset: pin down cpus and mems while a task is being attached
cpuset: make CPU / memory hotplug propagation asynchronous
cpuset: drop async_rebuild_sched_domains()
cpuset: don't nest cgroup_mutex inside get_online_cpus()
cpuset: reorganize CPU / memory hotplug handling
cpuset: cleanup cpuset[_can]_attach()
cpuset: introduce cpuset_for_each_child()
cpuset: introduce CS_ONLINE
cpuset: introduce ->css_on/offline()
cpuset: remove fast exit path from remove_tasks_in_empty_cpuset()
cpuset: remove unused cpuset_unlock()
-rw-r--r-- | MAINTAINERS | 4 | ||||
-rw-r--r-- | kernel/cpuset.c | 872 |
2 files changed, 485 insertions, 391 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 526fb85f2f7e..b7013e41b623 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
@@ -2140,10 +2140,10 @@ S: Maintained | |||
2140 | F: tools/power/cpupower | 2140 | F: tools/power/cpupower |
2141 | 2141 | ||
2142 | CPUSETS | 2142 | CPUSETS |
2143 | M: Paul Menage <paul@paulmenage.org> | 2143 | M: Li Zefan <lizefan@huawei.com> |
2144 | W: http://www.bullopensource.org/cpuset/ | 2144 | W: http://www.bullopensource.org/cpuset/ |
2145 | W: http://oss.sgi.com/projects/cpusets/ | 2145 | W: http://oss.sgi.com/projects/cpusets/ |
2146 | S: Supported | 2146 | S: Maintained |
2147 | F: Documentation/cgroups/cpusets.txt | 2147 | F: Documentation/cgroups/cpusets.txt |
2148 | F: include/linux/cpuset.h | 2148 | F: include/linux/cpuset.h |
2149 | F: kernel/cpuset.c | 2149 | F: kernel/cpuset.c |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5bb9bf18438c..4f9dfe43ecbd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -61,14 +61,6 @@ | |||
61 | #include <linux/cgroup.h> | 61 | #include <linux/cgroup.h> |
62 | 62 | ||
63 | /* | 63 | /* |
64 | * Workqueue for cpuset related tasks. | ||
65 | * | ||
66 | * Using kevent workqueue may cause deadlock when memory_migrate | ||
67 | * is set. So we create a separate workqueue thread for cpuset. | ||
68 | */ | ||
69 | static struct workqueue_struct *cpuset_wq; | ||
70 | |||
71 | /* | ||
72 | * Tracks how many cpusets are currently defined in system. | 64 | * Tracks how many cpusets are currently defined in system. |
73 | * When there is only one cpuset (the root cpuset) we can | 65 | * When there is only one cpuset (the root cpuset) we can |
74 | * short circuit some hooks. | 66 | * short circuit some hooks. |
@@ -95,18 +87,21 @@ struct cpuset { | |||
95 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 87 | cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
96 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ | 88 | nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ |
97 | 89 | ||
98 | struct cpuset *parent; /* my parent */ | ||
99 | |||
100 | struct fmeter fmeter; /* memory_pressure filter */ | 90 | struct fmeter fmeter; /* memory_pressure filter */ |
101 | 91 | ||
92 | /* | ||
93 | * Tasks are being attached to this cpuset. Used to prevent | ||
94 | * zeroing cpus/mems_allowed between ->can_attach() and ->attach(). | ||
95 | */ | ||
96 | int attach_in_progress; | ||
97 | |||
102 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
103 | int pn; | 99 | int pn; |
104 | 100 | ||
105 | /* for custom sched domain */ | 101 | /* for custom sched domain */ |
106 | int relax_domain_level; | 102 | int relax_domain_level; |
107 | 103 | ||
108 | /* used for walking a cpuset hierarchy */ | 104 | struct work_struct hotplug_work; |
109 | struct list_head stack_list; | ||
110 | }; | 105 | }; |
111 | 106 | ||
112 | /* Retrieve the cpuset for a cgroup */ | 107 | /* Retrieve the cpuset for a cgroup */ |
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
123 | struct cpuset, css); | 118 | struct cpuset, css); |
124 | } | 119 | } |
125 | 120 | ||
121 | static inline struct cpuset *parent_cs(const struct cpuset *cs) | ||
122 | { | ||
123 | struct cgroup *pcgrp = cs->css.cgroup->parent; | ||
124 | |||
125 | if (pcgrp) | ||
126 | return cgroup_cs(pcgrp); | ||
127 | return NULL; | ||
128 | } | ||
129 | |||
126 | #ifdef CONFIG_NUMA | 130 | #ifdef CONFIG_NUMA |
127 | static inline bool task_has_mempolicy(struct task_struct *task) | 131 | static inline bool task_has_mempolicy(struct task_struct *task) |
128 | { | 132 | { |
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task) | |||
138 | 142 | ||
139 | /* bits in struct cpuset flags field */ | 143 | /* bits in struct cpuset flags field */ |
140 | typedef enum { | 144 | typedef enum { |
145 | CS_ONLINE, | ||
141 | CS_CPU_EXCLUSIVE, | 146 | CS_CPU_EXCLUSIVE, |
142 | CS_MEM_EXCLUSIVE, | 147 | CS_MEM_EXCLUSIVE, |
143 | CS_MEM_HARDWALL, | 148 | CS_MEM_HARDWALL, |
@@ -147,13 +152,12 @@ typedef enum { | |||
147 | CS_SPREAD_SLAB, | 152 | CS_SPREAD_SLAB, |
148 | } cpuset_flagbits_t; | 153 | } cpuset_flagbits_t; |
149 | 154 | ||
150 | /* the type of hotplug event */ | ||
151 | enum hotplug_event { | ||
152 | CPUSET_CPU_OFFLINE, | ||
153 | CPUSET_MEM_OFFLINE, | ||
154 | }; | ||
155 | |||
156 | /* convenient tests for these bits */ | 155 | /* convenient tests for these bits */ |
156 | static inline bool is_cpuset_online(const struct cpuset *cs) | ||
157 | { | ||
158 | return test_bit(CS_ONLINE, &cs->flags); | ||
159 | } | ||
160 | |||
157 | static inline int is_cpu_exclusive(const struct cpuset *cs) | 161 | static inline int is_cpu_exclusive(const struct cpuset *cs) |
158 | { | 162 | { |
159 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); | 163 | return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); |
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs) | |||
190 | } | 194 | } |
191 | 195 | ||
192 | static struct cpuset top_cpuset = { | 196 | static struct cpuset top_cpuset = { |
193 | .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), | 197 | .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | |
198 | (1 << CS_MEM_EXCLUSIVE)), | ||
194 | }; | 199 | }; |
195 | 200 | ||
201 | /** | ||
202 | * cpuset_for_each_child - traverse online children of a cpuset | ||
203 | * @child_cs: loop cursor pointing to the current child | ||
204 | * @pos_cgrp: used for iteration | ||
205 | * @parent_cs: target cpuset to walk children of | ||
206 | * | ||
207 | * Walk @child_cs through the online children of @parent_cs. Must be used | ||
208 | * with RCU read locked. | ||
209 | */ | ||
210 | #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ | ||
211 | cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ | ||
212 | if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) | ||
213 | |||
214 | /** | ||
215 | * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants | ||
216 | * @des_cs: loop cursor pointing to the current descendant | ||
217 | * @pos_cgrp: used for iteration | ||
218 | * @root_cs: target cpuset to walk ancestor of | ||
219 | * | ||
220 | * Walk @des_cs through the online descendants of @root_cs. Must be used | ||
221 | * with RCU read locked. The caller may modify @pos_cgrp by calling | ||
222 | * cgroup_rightmost_descendant() to skip subtree. | ||
223 | */ | ||
224 | #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ | ||
225 | cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ | ||
226 | if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) | ||
227 | |||
196 | /* | 228 | /* |
197 | * There are two global mutexes guarding cpuset structures. The first | 229 | * There are two global mutexes guarding cpuset structures - cpuset_mutex |
198 | * is the main control groups cgroup_mutex, accessed via | 230 | * and callback_mutex. The latter may nest inside the former. We also |
199 | * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific | 231 | * require taking task_lock() when dereferencing a task's cpuset pointer. |
200 | * callback_mutex, below. They can nest. It is ok to first take | 232 | * See "The task_lock() exception", at the end of this comment. |
201 | * cgroup_mutex, then nest callback_mutex. We also require taking | 233 | * |
202 | * task_lock() when dereferencing a task's cpuset pointer. See "The | 234 | * A task must hold both mutexes to modify cpusets. If a task holds |
203 | * task_lock() exception", at the end of this comment. | 235 | * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it |
204 | * | 236 | * is the only task able to also acquire callback_mutex and be able to |
205 | * A task must hold both mutexes to modify cpusets. If a task | 237 | * modify cpusets. It can perform various checks on the cpuset structure |
206 | * holds cgroup_mutex, then it blocks others wanting that mutex, | 238 | * first, knowing nothing will change. It can also allocate memory while |
207 | * ensuring that it is the only task able to also acquire callback_mutex | 239 | * just holding cpuset_mutex. While it is performing these checks, various |
208 | * and be able to modify cpusets. It can perform various checks on | 240 | * callback routines can briefly acquire callback_mutex to query cpusets. |
209 | * the cpuset structure first, knowing nothing will change. It can | 241 | * Once it is ready to make the changes, it takes callback_mutex, blocking |
210 | * also allocate memory while just holding cgroup_mutex. While it is | 242 | * everyone else. |
211 | * performing these checks, various callback routines can briefly | ||
212 | * acquire callback_mutex to query cpusets. Once it is ready to make | ||
213 | * the changes, it takes callback_mutex, blocking everyone else. | ||
214 | * | 243 | * |
215 | * Calls to the kernel memory allocator can not be made while holding | 244 | * Calls to the kernel memory allocator can not be made while holding |
216 | * callback_mutex, as that would risk double tripping on callback_mutex | 245 | * callback_mutex, as that would risk double tripping on callback_mutex |
@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = { | |||
232 | * guidelines for accessing subsystem state in kernel/cgroup.c | 261 | * guidelines for accessing subsystem state in kernel/cgroup.c |
233 | */ | 262 | */ |
234 | 263 | ||
264 | static DEFINE_MUTEX(cpuset_mutex); | ||
235 | static DEFINE_MUTEX(callback_mutex); | 265 | static DEFINE_MUTEX(callback_mutex); |
236 | 266 | ||
237 | /* | 267 | /* |
@@ -246,6 +276,17 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; | |||
246 | static DEFINE_SPINLOCK(cpuset_buffer_lock); | 276 | static DEFINE_SPINLOCK(cpuset_buffer_lock); |
247 | 277 | ||
248 | /* | 278 | /* |
279 | * CPU / memory hotplug is handled asynchronously. | ||
280 | */ | ||
281 | static struct workqueue_struct *cpuset_propagate_hotplug_wq; | ||
282 | |||
283 | static void cpuset_hotplug_workfn(struct work_struct *work); | ||
284 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work); | ||
285 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs); | ||
286 | |||
287 | static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); | ||
288 | |||
289 | /* | ||
249 | * This is ugly, but preserves the userspace API for existing cpuset | 290 | * This is ugly, but preserves the userspace API for existing cpuset |
250 | * users. If someone tries to mount the "cpuset" filesystem, we | 291 | * users. If someone tries to mount the "cpuset" filesystem, we |
251 | * silently switch it to mount "cgroup" instead | 292 | * silently switch it to mount "cgroup" instead |
@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, | |||
289 | struct cpumask *pmask) | 330 | struct cpumask *pmask) |
290 | { | 331 | { |
291 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) | 332 | while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) |
292 | cs = cs->parent; | 333 | cs = parent_cs(cs); |
293 | if (cs) | 334 | if (cs) |
294 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); | 335 | cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); |
295 | else | 336 | else |
@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
314 | { | 355 | { |
315 | while (cs && !nodes_intersects(cs->mems_allowed, | 356 | while (cs && !nodes_intersects(cs->mems_allowed, |
316 | node_states[N_MEMORY])) | 357 | node_states[N_MEMORY])) |
317 | cs = cs->parent; | 358 | cs = parent_cs(cs); |
318 | if (cs) | 359 | if (cs) |
319 | nodes_and(*pmask, cs->mems_allowed, | 360 | nodes_and(*pmask, cs->mems_allowed, |
320 | node_states[N_MEMORY]); | 361 | node_states[N_MEMORY]); |
@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
326 | /* | 367 | /* |
327 | * update task's spread flag if cpuset's page/slab spread flag is set | 368 | * update task's spread flag if cpuset's page/slab spread flag is set |
328 | * | 369 | * |
329 | * Called with callback_mutex/cgroup_mutex held | 370 | * Called with callback_mutex/cpuset_mutex held |
330 | */ | 371 | */ |
331 | static void cpuset_update_task_spread_flag(struct cpuset *cs, | 372 | static void cpuset_update_task_spread_flag(struct cpuset *cs, |
332 | struct task_struct *tsk) | 373 | struct task_struct *tsk) |
@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, | |||
346 | * | 387 | * |
347 | * One cpuset is a subset of another if all its allowed CPUs and | 388 | * One cpuset is a subset of another if all its allowed CPUs and |
348 | * Memory Nodes are a subset of the other, and its exclusive flags | 389 | * Memory Nodes are a subset of the other, and its exclusive flags |
349 | * are only set if the other's are set. Call holding cgroup_mutex. | 390 | * are only set if the other's are set. Call holding cpuset_mutex. |
350 | */ | 391 | */ |
351 | 392 | ||
352 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 393 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial) | |||
395 | * If we replaced the flag and mask values of the current cpuset | 436 | * If we replaced the flag and mask values of the current cpuset |
396 | * (cur) with those values in the trial cpuset (trial), would | 437 | * (cur) with those values in the trial cpuset (trial), would |
397 | * our various subset and exclusive rules still be valid? Presumes | 438 | * our various subset and exclusive rules still be valid? Presumes |
398 | * cgroup_mutex held. | 439 | * cpuset_mutex held. |
399 | * | 440 | * |
400 | * 'cur' is the address of an actual, in-use cpuset. Operations | 441 | * 'cur' is the address of an actual, in-use cpuset. Operations |
401 | * such as list traversal that depend on the actual address of the | 442 | * such as list traversal that depend on the actual address of the |
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
412 | { | 453 | { |
413 | struct cgroup *cont; | 454 | struct cgroup *cont; |
414 | struct cpuset *c, *par; | 455 | struct cpuset *c, *par; |
456 | int ret; | ||
457 | |||
458 | rcu_read_lock(); | ||
415 | 459 | ||
416 | /* Each of our child cpusets must be a subset of us */ | 460 | /* Each of our child cpusets must be a subset of us */ |
417 | list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { | 461 | ret = -EBUSY; |
418 | if (!is_cpuset_subset(cgroup_cs(cont), trial)) | 462 | cpuset_for_each_child(c, cont, cur) |
419 | return -EBUSY; | 463 | if (!is_cpuset_subset(c, trial)) |
420 | } | 464 | goto out; |
421 | 465 | ||
422 | /* Remaining checks don't apply to root cpuset */ | 466 | /* Remaining checks don't apply to root cpuset */ |
467 | ret = 0; | ||
423 | if (cur == &top_cpuset) | 468 | if (cur == &top_cpuset) |
424 | return 0; | 469 | goto out; |
425 | 470 | ||
426 | par = cur->parent; | 471 | par = parent_cs(cur); |
427 | 472 | ||
428 | /* We must be a subset of our parent cpuset */ | 473 | /* We must be a subset of our parent cpuset */ |
474 | ret = -EACCES; | ||
429 | if (!is_cpuset_subset(trial, par)) | 475 | if (!is_cpuset_subset(trial, par)) |
430 | return -EACCES; | 476 | goto out; |
431 | 477 | ||
432 | /* | 478 | /* |
433 | * If either I or some sibling (!= me) is exclusive, we can't | 479 | * If either I or some sibling (!= me) is exclusive, we can't |
434 | * overlap | 480 | * overlap |
435 | */ | 481 | */ |
436 | list_for_each_entry(cont, &par->css.cgroup->children, sibling) { | 482 | ret = -EINVAL; |
437 | c = cgroup_cs(cont); | 483 | cpuset_for_each_child(c, cont, par) { |
438 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 484 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
439 | c != cur && | 485 | c != cur && |
440 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) | 486 | cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) |
441 | return -EINVAL; | 487 | goto out; |
442 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && | 488 | if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && |
443 | c != cur && | 489 | c != cur && |
444 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) | 490 | nodes_intersects(trial->mems_allowed, c->mems_allowed)) |
445 | return -EINVAL; | 491 | goto out; |
446 | } | 492 | } |
447 | 493 | ||
448 | /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ | 494 | /* |
449 | if (cgroup_task_count(cur->css.cgroup)) { | 495 | * Cpusets with tasks - existing or newly being attached - can't |
450 | if (cpumask_empty(trial->cpus_allowed) || | 496 | * have empty cpus_allowed or mems_allowed. |
451 | nodes_empty(trial->mems_allowed)) { | 497 | */ |
452 | return -ENOSPC; | 498 | ret = -ENOSPC; |
453 | } | 499 | if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) && |
454 | } | 500 | (cpumask_empty(trial->cpus_allowed) || |
501 | nodes_empty(trial->mems_allowed))) | ||
502 | goto out; | ||
455 | 503 | ||
456 | return 0; | 504 | ret = 0; |
505 | out: | ||
506 | rcu_read_unlock(); | ||
507 | return ret; | ||
457 | } | 508 | } |
458 | 509 | ||
459 | #ifdef CONFIG_SMP | 510 | #ifdef CONFIG_SMP |
@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
474 | return; | 525 | return; |
475 | } | 526 | } |
476 | 527 | ||
477 | static void | 528 | static void update_domain_attr_tree(struct sched_domain_attr *dattr, |
478 | update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | 529 | struct cpuset *root_cs) |
479 | { | 530 | { |
480 | LIST_HEAD(q); | 531 | struct cpuset *cp; |
481 | 532 | struct cgroup *pos_cgrp; | |
482 | list_add(&c->stack_list, &q); | ||
483 | while (!list_empty(&q)) { | ||
484 | struct cpuset *cp; | ||
485 | struct cgroup *cont; | ||
486 | struct cpuset *child; | ||
487 | |||
488 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
489 | list_del(q.next); | ||
490 | 533 | ||
491 | if (cpumask_empty(cp->cpus_allowed)) | 534 | rcu_read_lock(); |
535 | cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { | ||
536 | /* skip the whole subtree if @cp doesn't have any CPU */ | ||
537 | if (cpumask_empty(cp->cpus_allowed)) { | ||
538 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); | ||
492 | continue; | 539 | continue; |
540 | } | ||
493 | 541 | ||
494 | if (is_sched_load_balance(cp)) | 542 | if (is_sched_load_balance(cp)) |
495 | update_domain_attr(dattr, cp); | 543 | update_domain_attr(dattr, cp); |
496 | |||
497 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
498 | child = cgroup_cs(cont); | ||
499 | list_add_tail(&child->stack_list, &q); | ||
500 | } | ||
501 | } | 544 | } |
545 | rcu_read_unlock(); | ||
502 | } | 546 | } |
503 | 547 | ||
504 | /* | 548 | /* |
@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
520 | * domains when operating in the severe memory shortage situations | 564 | * domains when operating in the severe memory shortage situations |
521 | * that could cause allocation failures below. | 565 | * that could cause allocation failures below. |
522 | * | 566 | * |
523 | * Must be called with cgroup_lock held. | 567 | * Must be called with cpuset_mutex held. |
524 | * | 568 | * |
525 | * The three key local variables below are: | 569 | * The three key local variables below are: |
526 | * q - a linked-list queue of cpuset pointers, used to implement a | 570 | * q - a linked-list queue of cpuset pointers, used to implement a |
@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | |||
558 | static int generate_sched_domains(cpumask_var_t **domains, | 602 | static int generate_sched_domains(cpumask_var_t **domains, |
559 | struct sched_domain_attr **attributes) | 603 | struct sched_domain_attr **attributes) |
560 | { | 604 | { |
561 | LIST_HEAD(q); /* queue of cpusets to be scanned */ | ||
562 | struct cpuset *cp; /* scans q */ | 605 | struct cpuset *cp; /* scans q */ |
563 | struct cpuset **csa; /* array of all cpuset ptrs */ | 606 | struct cpuset **csa; /* array of all cpuset ptrs */ |
564 | int csn; /* how many cpuset ptrs in csa so far */ | 607 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
567 | struct sched_domain_attr *dattr; /* attributes for custom domains */ | 610 | struct sched_domain_attr *dattr; /* attributes for custom domains */ |
568 | int ndoms = 0; /* number of sched domains in result */ | 611 | int ndoms = 0; /* number of sched domains in result */ |
569 | int nslot; /* next empty doms[] struct cpumask slot */ | 612 | int nslot; /* next empty doms[] struct cpumask slot */ |
613 | struct cgroup *pos_cgrp; | ||
570 | 614 | ||
571 | doms = NULL; | 615 | doms = NULL; |
572 | dattr = NULL; | 616 | dattr = NULL; |
@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
594 | goto done; | 638 | goto done; |
595 | csn = 0; | 639 | csn = 0; |
596 | 640 | ||
597 | list_add(&top_cpuset.stack_list, &q); | 641 | rcu_read_lock(); |
598 | while (!list_empty(&q)) { | 642 | cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { |
599 | struct cgroup *cont; | ||
600 | struct cpuset *child; /* scans child cpusets of cp */ | ||
601 | |||
602 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
603 | list_del(q.next); | ||
604 | |||
605 | if (cpumask_empty(cp->cpus_allowed)) | ||
606 | continue; | ||
607 | |||
608 | /* | 643 | /* |
609 | * All child cpusets contain a subset of the parent's cpus, so | 644 | * Continue traversing beyond @cp iff @cp has some CPUs and |
610 | * just skip them, and then we call update_domain_attr_tree() | 645 | * isn't load balancing. The former is obvious. The |
611 | * to calc relax_domain_level of the corresponding sched | 646 | * latter: All child cpusets contain a subset of the |
612 | * domain. | 647 | * parent's cpus, so just skip them, and then we call |
648 | * update_domain_attr_tree() to calc relax_domain_level of | ||
649 | * the corresponding sched domain. | ||
613 | */ | 650 | */ |
614 | if (is_sched_load_balance(cp)) { | 651 | if (!cpumask_empty(cp->cpus_allowed) && |
615 | csa[csn++] = cp; | 652 | !is_sched_load_balance(cp)) |
616 | continue; | 653 | continue; |
617 | } | ||
618 | 654 | ||
619 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 655 | if (is_sched_load_balance(cp)) |
620 | child = cgroup_cs(cont); | 656 | csa[csn++] = cp; |
621 | list_add_tail(&child->stack_list, &q); | 657 | |
622 | } | 658 | /* skip @cp's subtree */ |
623 | } | 659 | pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); |
660 | } | ||
661 | rcu_read_unlock(); | ||
624 | 662 | ||
625 | for (i = 0; i < csn; i++) | 663 | for (i = 0; i < csn; i++) |
626 | csa[i]->pn = i; | 664 | csa[i]->pn = i; |
@@ -725,25 +763,25 @@ done: | |||
725 | /* | 763 | /* |
726 | * Rebuild scheduler domains. | 764 | * Rebuild scheduler domains. |
727 | * | 765 | * |
728 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | 766 | * If the flag 'sched_load_balance' of any cpuset with non-empty |
729 | * Takes both cgroup_mutex and get_online_cpus(). | 767 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset |
768 | * which has that flag enabled, or if any cpuset with a non-empty | ||
769 | * 'cpus' is removed, then call this routine to rebuild the | ||
770 | * scheduler's dynamic sched domains. | ||
730 | * | 771 | * |
731 | * Cannot be directly called from cpuset code handling changes | 772 | * Call with cpuset_mutex held. Takes get_online_cpus(). |
732 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
733 | * from code that already holds cgroup_mutex. | ||
734 | */ | 773 | */ |
735 | static void do_rebuild_sched_domains(struct work_struct *unused) | 774 | static void rebuild_sched_domains_locked(void) |
736 | { | 775 | { |
737 | struct sched_domain_attr *attr; | 776 | struct sched_domain_attr *attr; |
738 | cpumask_var_t *doms; | 777 | cpumask_var_t *doms; |
739 | int ndoms; | 778 | int ndoms; |
740 | 779 | ||
780 | lockdep_assert_held(&cpuset_mutex); | ||
741 | get_online_cpus(); | 781 | get_online_cpus(); |
742 | 782 | ||
743 | /* Generate domain masks and attrs */ | 783 | /* Generate domain masks and attrs */ |
744 | cgroup_lock(); | ||
745 | ndoms = generate_sched_domains(&doms, &attr); | 784 | ndoms = generate_sched_domains(&doms, &attr); |
746 | cgroup_unlock(); | ||
747 | 785 | ||
748 | /* Have scheduler rebuild the domains */ | 786 | /* Have scheduler rebuild the domains */ |
749 | partition_sched_domains(ndoms, doms, attr); | 787 | partition_sched_domains(ndoms, doms, attr); |
@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) | |||
751 | put_online_cpus(); | 789 | put_online_cpus(); |
752 | } | 790 | } |
753 | #else /* !CONFIG_SMP */ | 791 | #else /* !CONFIG_SMP */ |
754 | static void do_rebuild_sched_domains(struct work_struct *unused) | 792 | static void rebuild_sched_domains_locked(void) |
755 | { | 793 | { |
756 | } | 794 | } |
757 | 795 | ||
@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains, | |||
763 | } | 801 | } |
764 | #endif /* CONFIG_SMP */ | 802 | #endif /* CONFIG_SMP */ |
765 | 803 | ||
766 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); | ||
767 | |||
768 | /* | ||
769 | * Rebuild scheduler domains, asynchronously via workqueue. | ||
770 | * | ||
771 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
772 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
773 | * which has that flag enabled, or if any cpuset with a non-empty | ||
774 | * 'cpus' is removed, then call this routine to rebuild the | ||
775 | * scheduler's dynamic sched domains. | ||
776 | * | ||
777 | * The rebuild_sched_domains() and partition_sched_domains() | ||
778 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
779 | * but such cpuset changes as these must nest that locking the | ||
780 | * other way, holding cgroup_lock() for much of the code. | ||
781 | * | ||
782 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
783 | * these user changes delegates the actual sched domain rebuilding | ||
784 | * to a separate workqueue thread, which ends up processing the | ||
785 | * above do_rebuild_sched_domains() function. | ||
786 | */ | ||
787 | static void async_rebuild_sched_domains(void) | ||
788 | { | ||
789 | queue_work(cpuset_wq, &rebuild_sched_domains_work); | ||
790 | } | ||
791 | |||
792 | /* | ||
793 | * Accomplishes the same scheduler domain rebuild as the above | ||
794 | * async_rebuild_sched_domains(), however it directly calls the | ||
795 | * rebuild routine synchronously rather than calling it via an | ||
796 | * asynchronous work thread. | ||
797 | * | ||
798 | * This can only be called from code that is not holding | ||
799 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
800 | */ | ||
801 | void rebuild_sched_domains(void) | 804 | void rebuild_sched_domains(void) |
802 | { | 805 | { |
803 | do_rebuild_sched_domains(NULL); | 806 | mutex_lock(&cpuset_mutex); |
807 | rebuild_sched_domains_locked(); | ||
808 | mutex_unlock(&cpuset_mutex); | ||
804 | } | 809 | } |
805 | 810 | ||
806 | /** | 811 | /** |
@@ -808,7 +813,7 @@ void rebuild_sched_domains(void) | |||
808 | * @tsk: task to test | 813 | * @tsk: task to test |
809 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | 814 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner |
810 | * | 815 | * |
811 | * Call with cgroup_mutex held. May take callback_mutex during call. | 816 | * Call with cpuset_mutex held. May take callback_mutex during call. |
812 | * Called for each task in a cgroup by cgroup_scan_tasks(). | 817 | * Called for each task in a cgroup by cgroup_scan_tasks(). |
813 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other | 818 | * Return nonzero if this tasks's cpus_allowed mask should be changed (in other |
814 | * words, if its mask is not equal to its cpuset's mask). | 819 | * words, if its mask is not equal to its cpuset's mask). |
@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, | |||
829 | * cpus_allowed mask needs to be changed. | 834 | * cpus_allowed mask needs to be changed. |
830 | * | 835 | * |
831 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 836 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
832 | * holding cgroup_lock() at this point. | 837 | * holding cpuset_mutex at this point. |
833 | */ | 838 | */ |
834 | static void cpuset_change_cpumask(struct task_struct *tsk, | 839 | static void cpuset_change_cpumask(struct task_struct *tsk, |
835 | struct cgroup_scanner *scan) | 840 | struct cgroup_scanner *scan) |
@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, | |||
842 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | 847 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed |
843 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 848 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
844 | * | 849 | * |
845 | * Called with cgroup_mutex held | 850 | * Called with cpuset_mutex held |
846 | * | 851 | * |
847 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 852 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
848 | * calling callback functions for each. | 853 | * calling callback functions for each. |
@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
920 | heap_free(&heap); | 925 | heap_free(&heap); |
921 | 926 | ||
922 | if (is_load_balanced) | 927 | if (is_load_balanced) |
923 | async_rebuild_sched_domains(); | 928 | rebuild_sched_domains_locked(); |
924 | return 0; | 929 | return 0; |
925 | } | 930 | } |
926 | 931 | ||
@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, | |||
932 | * Temporarily set tasks mems_allowed to target nodes of migration, | 937 | * Temporarily set tasks mems_allowed to target nodes of migration, |
933 | * so that the migration code can allocate pages on these nodes. | 938 | * so that the migration code can allocate pages on these nodes. |
934 | * | 939 | * |
935 | * Call holding cgroup_mutex, so current's cpuset won't change | 940 | * Call holding cpuset_mutex, so current's cpuset won't change |
936 | * during this call, as manage_mutex holds off any cpuset_attach() | 941 | * during this call, as manage_mutex holds off any cpuset_attach() |
937 | * calls. Therefore we don't need to take task_lock around the | 942 | * calls. Therefore we don't need to take task_lock around the |
938 | * call to guarantee_online_mems(), as we know no one is changing | 943 | * call to guarantee_online_mems(), as we know no one is changing |
@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, | |||
1007 | /* | 1012 | /* |
1008 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy | 1013 | * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy |
1009 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if | 1014 | * of it to cpuset's new mems_allowed, and migrate pages to new nodes if |
1010 | * memory_migrate flag is set. Called with cgroup_mutex held. | 1015 | * memory_migrate flag is set. Called with cpuset_mutex held. |
1011 | */ | 1016 | */ |
1012 | static void cpuset_change_nodemask(struct task_struct *p, | 1017 | static void cpuset_change_nodemask(struct task_struct *p, |
1013 | struct cgroup_scanner *scan) | 1018 | struct cgroup_scanner *scan) |
@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p, | |||
1016 | struct cpuset *cs; | 1021 | struct cpuset *cs; |
1017 | int migrate; | 1022 | int migrate; |
1018 | const nodemask_t *oldmem = scan->data; | 1023 | const nodemask_t *oldmem = scan->data; |
1019 | static nodemask_t newmems; /* protected by cgroup_mutex */ | 1024 | static nodemask_t newmems; /* protected by cpuset_mutex */ |
1020 | 1025 | ||
1021 | cs = cgroup_cs(scan->cg); | 1026 | cs = cgroup_cs(scan->cg); |
1022 | guarantee_online_mems(cs, &newmems); | 1027 | guarantee_online_mems(cs, &newmems); |
@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound; | |||
1043 | * @oldmem: old mems_allowed of cpuset cs | 1048 | * @oldmem: old mems_allowed of cpuset cs |
1044 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1049 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
1045 | * | 1050 | * |
1046 | * Called with cgroup_mutex held | 1051 | * Called with cpuset_mutex held |
1047 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | 1052 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 |
1048 | * if @heap != NULL. | 1053 | * if @heap != NULL. |
1049 | */ | 1054 | */ |
@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1065 | * take while holding tasklist_lock. Forks can happen - the | 1070 | * take while holding tasklist_lock. Forks can happen - the |
1066 | * mpol_dup() cpuset_being_rebound check will catch such forks, | 1071 | * mpol_dup() cpuset_being_rebound check will catch such forks, |
1067 | * and rebind their vma mempolicies too. Because we still hold | 1072 | * and rebind their vma mempolicies too. Because we still hold |
1068 | * the global cgroup_mutex, we know that no other rebind effort | 1073 | * the global cpuset_mutex, we know that no other rebind effort |
1069 | * will be contending for the global variable cpuset_being_rebound. | 1074 | * will be contending for the global variable cpuset_being_rebound. |
1070 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 1075 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
1071 | * is idempotent. Also migrate pages in each mm to new nodes. | 1076 | * is idempotent. Also migrate pages in each mm to new nodes. |
@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, | |||
1084 | * mempolicies and if the cpuset is marked 'memory_migrate', | 1089 | * mempolicies and if the cpuset is marked 'memory_migrate', |
1085 | * migrate the tasks pages to the new memory. | 1090 | * migrate the tasks pages to the new memory. |
1086 | * | 1091 | * |
1087 | * Call with cgroup_mutex held. May take callback_mutex during call. | 1092 | * Call with cpuset_mutex held. May take callback_mutex during call. |
1088 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 1093 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
1089 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | 1094 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind |
1090 | * their mempolicies to the cpusets new mems_allowed. | 1095 | * their mempolicies to the cpusets new mems_allowed. |
@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1168 | cs->relax_domain_level = val; | 1173 | cs->relax_domain_level = val; |
1169 | if (!cpumask_empty(cs->cpus_allowed) && | 1174 | if (!cpumask_empty(cs->cpus_allowed) && |
1170 | is_sched_load_balance(cs)) | 1175 | is_sched_load_balance(cs)) |
1171 | async_rebuild_sched_domains(); | 1176 | rebuild_sched_domains_locked(); |
1172 | } | 1177 | } |
1173 | 1178 | ||
1174 | return 0; | 1179 | return 0; |
@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1182 | * Called by cgroup_scan_tasks() for each task in a cgroup. | 1187 | * Called by cgroup_scan_tasks() for each task in a cgroup. |
1183 | * | 1188 | * |
1184 | * We don't need to re-check for the cgroup/cpuset membership, since we're | 1189 | * We don't need to re-check for the cgroup/cpuset membership, since we're |
1185 | * holding cgroup_lock() at this point. | 1190 | * holding cpuset_mutex at this point. |
1186 | */ | 1191 | */ |
1187 | static void cpuset_change_flag(struct task_struct *tsk, | 1192 | static void cpuset_change_flag(struct task_struct *tsk, |
1188 | struct cgroup_scanner *scan) | 1193 | struct cgroup_scanner *scan) |
@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk, | |||
1195 | * @cs: the cpuset in which each task's spread flags needs to be changed | 1200 | * @cs: the cpuset in which each task's spread flags needs to be changed |
1196 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | 1201 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() |
1197 | * | 1202 | * |
1198 | * Called with cgroup_mutex held | 1203 | * Called with cpuset_mutex held |
1199 | * | 1204 | * |
1200 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 1205 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
1201 | * calling callback functions for each. | 1206 | * calling callback functions for each. |
@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) | |||
1220 | * cs: the cpuset to update | 1225 | * cs: the cpuset to update |
1221 | * turning_on: whether the flag is being set or cleared | 1226 | * turning_on: whether the flag is being set or cleared |
1222 | * | 1227 | * |
1223 | * Call with cgroup_mutex held. | 1228 | * Call with cpuset_mutex held. |
1224 | */ | 1229 | */ |
1225 | 1230 | ||
1226 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | 1231 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1260 | mutex_unlock(&callback_mutex); | 1265 | mutex_unlock(&callback_mutex); |
1261 | 1266 | ||
1262 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) | 1267 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) |
1263 | async_rebuild_sched_domains(); | 1268 | rebuild_sched_domains_locked(); |
1264 | 1269 | ||
1265 | if (spread_flag_changed) | 1270 | if (spread_flag_changed) |
1266 | update_tasks_flags(cs, &heap); | 1271 | update_tasks_flags(cs, &heap); |
@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
1368 | return val; | 1373 | return val; |
1369 | } | 1374 | } |
1370 | 1375 | ||
1371 | /* | 1376 | /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ |
1372 | * Protected by cgroup_lock. The nodemasks must be stored globally because | ||
1373 | * dynamically allocating them is not allowed in can_attach, and they must | ||
1374 | * persist until attach. | ||
1375 | */ | ||
1376 | static cpumask_var_t cpus_attach; | ||
1377 | static nodemask_t cpuset_attach_nodemask_from; | ||
1378 | static nodemask_t cpuset_attach_nodemask_to; | ||
1379 | |||
1380 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | ||
1381 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1377 | static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1382 | { | 1378 | { |
1383 | struct cpuset *cs = cgroup_cs(cgrp); | 1379 | struct cpuset *cs = cgroup_cs(cgrp); |
1384 | struct task_struct *task; | 1380 | struct task_struct *task; |
1385 | int ret; | 1381 | int ret; |
1386 | 1382 | ||
1383 | mutex_lock(&cpuset_mutex); | ||
1384 | |||
1385 | ret = -ENOSPC; | ||
1387 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | 1386 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) |
1388 | return -ENOSPC; | 1387 | goto out_unlock; |
1389 | 1388 | ||
1390 | cgroup_taskset_for_each(task, cgrp, tset) { | 1389 | cgroup_taskset_for_each(task, cgrp, tset) { |
1391 | /* | 1390 | /* |
@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1397 | * set_cpus_allowed_ptr() on all attached tasks before | 1396 | * set_cpus_allowed_ptr() on all attached tasks before |
1398 | * cpus_allowed may be changed. | 1397 | * cpus_allowed may be changed. |
1399 | */ | 1398 | */ |
1399 | ret = -EINVAL; | ||
1400 | if (task->flags & PF_THREAD_BOUND) | 1400 | if (task->flags & PF_THREAD_BOUND) |
1401 | return -EINVAL; | 1401 | goto out_unlock; |
1402 | if ((ret = security_task_setscheduler(task))) | 1402 | ret = security_task_setscheduler(task); |
1403 | return ret; | 1403 | if (ret) |
1404 | goto out_unlock; | ||
1404 | } | 1405 | } |
1405 | 1406 | ||
1406 | /* prepare for attach */ | 1407 | /* |
1407 | if (cs == &top_cpuset) | 1408 | * Mark attach is in progress. This makes validate_change() fail |
1408 | cpumask_copy(cpus_attach, cpu_possible_mask); | 1409 | * changes which zero cpus/mems_allowed. |
1409 | else | 1410 | */ |
1410 | guarantee_online_cpus(cs, cpus_attach); | 1411 | cs->attach_in_progress++; |
1411 | 1412 | ret = 0; | |
1412 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | 1413 | out_unlock: |
1414 | mutex_unlock(&cpuset_mutex); | ||
1415 | return ret; | ||
1416 | } | ||
1413 | 1417 | ||
1414 | return 0; | 1418 | static void cpuset_cancel_attach(struct cgroup *cgrp, |
1419 | struct cgroup_taskset *tset) | ||
1420 | { | ||
1421 | mutex_lock(&cpuset_mutex); | ||
1422 | cgroup_cs(cgrp)->attach_in_progress--; | ||
1423 | mutex_unlock(&cpuset_mutex); | ||
1415 | } | 1424 | } |
1416 | 1425 | ||
1426 | /* | ||
1427 | * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() | ||
1428 | * but we can't allocate it dynamically there. Define it global and | ||
1429 | * allocate from cpuset_init(). | ||
1430 | */ | ||
1431 | static cpumask_var_t cpus_attach; | ||
1432 | |||
1417 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | 1433 | static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) |
1418 | { | 1434 | { |
1435 | /* static bufs protected by cpuset_mutex */ | ||
1436 | static nodemask_t cpuset_attach_nodemask_from; | ||
1437 | static nodemask_t cpuset_attach_nodemask_to; | ||
1419 | struct mm_struct *mm; | 1438 | struct mm_struct *mm; |
1420 | struct task_struct *task; | 1439 | struct task_struct *task; |
1421 | struct task_struct *leader = cgroup_taskset_first(tset); | 1440 | struct task_struct *leader = cgroup_taskset_first(tset); |
@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1423 | struct cpuset *cs = cgroup_cs(cgrp); | 1442 | struct cpuset *cs = cgroup_cs(cgrp); |
1424 | struct cpuset *oldcs = cgroup_cs(oldcgrp); | 1443 | struct cpuset *oldcs = cgroup_cs(oldcgrp); |
1425 | 1444 | ||
1445 | mutex_lock(&cpuset_mutex); | ||
1446 | |||
1447 | /* prepare for attach */ | ||
1448 | if (cs == &top_cpuset) | ||
1449 | cpumask_copy(cpus_attach, cpu_possible_mask); | ||
1450 | else | ||
1451 | guarantee_online_cpus(cs, cpus_attach); | ||
1452 | |||
1453 | guarantee_online_mems(cs, &cpuset_attach_nodemask_to); | ||
1454 | |||
1426 | cgroup_taskset_for_each(task, cgrp, tset) { | 1455 | cgroup_taskset_for_each(task, cgrp, tset) { |
1427 | /* | 1456 | /* |
1428 | * can_attach beforehand should guarantee that this doesn't | 1457 | * can_attach beforehand should guarantee that this doesn't |
@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) | |||
1448 | &cpuset_attach_nodemask_to); | 1477 | &cpuset_attach_nodemask_to); |
1449 | mmput(mm); | 1478 | mmput(mm); |
1450 | } | 1479 | } |
1480 | |||
1481 | cs->attach_in_progress--; | ||
1482 | |||
1483 | /* | ||
1484 | * We may have raced with CPU/memory hotunplug. Trigger hotplug | ||
1485 | * propagation if @cs doesn't have any CPU or memory. It will move | ||
1486 | * the newly added tasks to the nearest parent which can execute. | ||
1487 | */ | ||
1488 | if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) | ||
1489 | schedule_cpuset_propagate_hotplug(cs); | ||
1490 | |||
1491 | mutex_unlock(&cpuset_mutex); | ||
1451 | } | 1492 | } |
1452 | 1493 | ||
1453 | /* The various types of files and directories in a cpuset file system */ | 1494 | /* The various types of files and directories in a cpuset file system */ |
@@ -1469,12 +1510,13 @@ typedef enum { | |||
1469 | 1510 | ||
1470 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1511 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) |
1471 | { | 1512 | { |
1472 | int retval = 0; | ||
1473 | struct cpuset *cs = cgroup_cs(cgrp); | 1513 | struct cpuset *cs = cgroup_cs(cgrp); |
1474 | cpuset_filetype_t type = cft->private; | 1514 | cpuset_filetype_t type = cft->private; |
1515 | int retval = -ENODEV; | ||
1475 | 1516 | ||
1476 | if (!cgroup_lock_live_group(cgrp)) | 1517 | mutex_lock(&cpuset_mutex); |
1477 | return -ENODEV; | 1518 | if (!is_cpuset_online(cs)) |
1519 | goto out_unlock; | ||
1478 | 1520 | ||
1479 | switch (type) { | 1521 | switch (type) { |
1480 | case FILE_CPU_EXCLUSIVE: | 1522 | case FILE_CPU_EXCLUSIVE: |
@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | |||
1508 | retval = -EINVAL; | 1550 | retval = -EINVAL; |
1509 | break; | 1551 | break; |
1510 | } | 1552 | } |
1511 | cgroup_unlock(); | 1553 | out_unlock: |
1554 | mutex_unlock(&cpuset_mutex); | ||
1512 | return retval; | 1555 | return retval; |
1513 | } | 1556 | } |
1514 | 1557 | ||
1515 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | 1558 | static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) |
1516 | { | 1559 | { |
1517 | int retval = 0; | ||
1518 | struct cpuset *cs = cgroup_cs(cgrp); | 1560 | struct cpuset *cs = cgroup_cs(cgrp); |
1519 | cpuset_filetype_t type = cft->private; | 1561 | cpuset_filetype_t type = cft->private; |
1562 | int retval = -ENODEV; | ||
1520 | 1563 | ||
1521 | if (!cgroup_lock_live_group(cgrp)) | 1564 | mutex_lock(&cpuset_mutex); |
1522 | return -ENODEV; | 1565 | if (!is_cpuset_online(cs)) |
1566 | goto out_unlock; | ||
1523 | 1567 | ||
1524 | switch (type) { | 1568 | switch (type) { |
1525 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1569 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1529 | retval = -EINVAL; | 1573 | retval = -EINVAL; |
1530 | break; | 1574 | break; |
1531 | } | 1575 | } |
1532 | cgroup_unlock(); | 1576 | out_unlock: |
1577 | mutex_unlock(&cpuset_mutex); | ||
1533 | return retval; | 1578 | return retval; |
1534 | } | 1579 | } |
1535 | 1580 | ||
@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1539 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | 1584 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, |
1540 | const char *buf) | 1585 | const char *buf) |
1541 | { | 1586 | { |
1542 | int retval = 0; | ||
1543 | struct cpuset *cs = cgroup_cs(cgrp); | 1587 | struct cpuset *cs = cgroup_cs(cgrp); |
1544 | struct cpuset *trialcs; | 1588 | struct cpuset *trialcs; |
1589 | int retval = -ENODEV; | ||
1545 | 1590 | ||
1546 | if (!cgroup_lock_live_group(cgrp)) | 1591 | /* |
1547 | return -ENODEV; | 1592 | * CPU or memory hotunplug may leave @cs w/o any execution |
1593 | * resources, in which case the hotplug code asynchronously updates | ||
1594 | * configuration and transfers all tasks to the nearest ancestor | ||
1595 | * which can execute. | ||
1596 | * | ||
1597 | * As writes to "cpus" or "mems" may restore @cs's execution | ||
1598 | * resources, wait for the previously scheduled operations before | ||
1599 | * proceeding, so that we don't end up keep removing tasks added | ||
1600 | * after execution capability is restored. | ||
1601 | * | ||
1602 | * Flushing cpuset_hotplug_work is enough to synchronize against | ||
1603 | * hotplug handling; however, cpuset_attach() may schedule | ||
1604 | * propagation work directly. Flush the workqueue too. | ||
1605 | */ | ||
1606 | flush_work(&cpuset_hotplug_work); | ||
1607 | flush_workqueue(cpuset_propagate_hotplug_wq); | ||
1608 | |||
1609 | mutex_lock(&cpuset_mutex); | ||
1610 | if (!is_cpuset_online(cs)) | ||
1611 | goto out_unlock; | ||
1548 | 1612 | ||
1549 | trialcs = alloc_trial_cpuset(cs); | 1613 | trialcs = alloc_trial_cpuset(cs); |
1550 | if (!trialcs) { | 1614 | if (!trialcs) { |
1551 | retval = -ENOMEM; | 1615 | retval = -ENOMEM; |
1552 | goto out; | 1616 | goto out_unlock; |
1553 | } | 1617 | } |
1554 | 1618 | ||
1555 | switch (cft->private) { | 1619 | switch (cft->private) { |
@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | |||
1565 | } | 1629 | } |
1566 | 1630 | ||
1567 | free_trial_cpuset(trialcs); | 1631 | free_trial_cpuset(trialcs); |
1568 | out: | 1632 | out_unlock: |
1569 | cgroup_unlock(); | 1633 | mutex_unlock(&cpuset_mutex); |
1570 | return retval; | 1634 | return retval; |
1571 | } | 1635 | } |
1572 | 1636 | ||
@@ -1790,15 +1854,12 @@ static struct cftype files[] = { | |||
1790 | 1854 | ||
1791 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | 1855 | static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) |
1792 | { | 1856 | { |
1793 | struct cgroup *parent_cg = cont->parent; | 1857 | struct cpuset *cs; |
1794 | struct cgroup *tmp_cg; | ||
1795 | struct cpuset *parent, *cs; | ||
1796 | 1858 | ||
1797 | if (!parent_cg) | 1859 | if (!cont->parent) |
1798 | return &top_cpuset.css; | 1860 | return &top_cpuset.css; |
1799 | parent = cgroup_cs(parent_cg); | ||
1800 | 1861 | ||
1801 | cs = kmalloc(sizeof(*cs), GFP_KERNEL); | 1862 | cs = kzalloc(sizeof(*cs), GFP_KERNEL); |
1802 | if (!cs) | 1863 | if (!cs) |
1803 | return ERR_PTR(-ENOMEM); | 1864 | return ERR_PTR(-ENOMEM); |
1804 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { | 1865 | if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { |
@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
1806 | return ERR_PTR(-ENOMEM); | 1867 | return ERR_PTR(-ENOMEM); |
1807 | } | 1868 | } |
1808 | 1869 | ||
1809 | cs->flags = 0; | ||
1810 | if (is_spread_page(parent)) | ||
1811 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
1812 | if (is_spread_slab(parent)) | ||
1813 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
1814 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | 1870 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); |
1815 | cpumask_clear(cs->cpus_allowed); | 1871 | cpumask_clear(cs->cpus_allowed); |
1816 | nodes_clear(cs->mems_allowed); | 1872 | nodes_clear(cs->mems_allowed); |
1817 | fmeter_init(&cs->fmeter); | 1873 | fmeter_init(&cs->fmeter); |
1874 | INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn); | ||
1818 | cs->relax_domain_level = -1; | 1875 | cs->relax_domain_level = -1; |
1819 | 1876 | ||
1820 | cs->parent = parent; | 1877 | return &cs->css; |
1878 | } | ||
1879 | |||
1880 | static int cpuset_css_online(struct cgroup *cgrp) | ||
1881 | { | ||
1882 | struct cpuset *cs = cgroup_cs(cgrp); | ||
1883 | struct cpuset *parent = parent_cs(cs); | ||
1884 | struct cpuset *tmp_cs; | ||
1885 | struct cgroup *pos_cg; | ||
1886 | |||
1887 | if (!parent) | ||
1888 | return 0; | ||
1889 | |||
1890 | mutex_lock(&cpuset_mutex); | ||
1891 | |||
1892 | set_bit(CS_ONLINE, &cs->flags); | ||
1893 | if (is_spread_page(parent)) | ||
1894 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
1895 | if (is_spread_slab(parent)) | ||
1896 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
1897 | |||
1821 | number_of_cpusets++; | 1898 | number_of_cpusets++; |
1822 | 1899 | ||
1823 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) | 1900 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) |
1824 | goto skip_clone; | 1901 | goto out_unlock; |
1825 | 1902 | ||
1826 | /* | 1903 | /* |
1827 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is | 1904 | * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is |
@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) | |||
1836 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | 1913 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive |
1837 | * (and likewise for mems) to the new cgroup. | 1914 | * (and likewise for mems) to the new cgroup. |
1838 | */ | 1915 | */ |
1839 | list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { | 1916 | rcu_read_lock(); |
1840 | struct cpuset *tmp_cs = cgroup_cs(tmp_cg); | 1917 | cpuset_for_each_child(tmp_cs, pos_cg, parent) { |
1841 | 1918 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { | |
1842 | if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) | 1919 | rcu_read_unlock(); |
1843 | goto skip_clone; | 1920 | goto out_unlock; |
1921 | } | ||
1844 | } | 1922 | } |
1923 | rcu_read_unlock(); | ||
1845 | 1924 | ||
1846 | mutex_lock(&callback_mutex); | 1925 | mutex_lock(&callback_mutex); |
1847 | cs->mems_allowed = parent->mems_allowed; | 1926 | cs->mems_allowed = parent->mems_allowed; |
1848 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | 1927 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); |
1849 | mutex_unlock(&callback_mutex); | 1928 | mutex_unlock(&callback_mutex); |
1850 | skip_clone: | 1929 | out_unlock: |
1851 | return &cs->css; | 1930 | mutex_unlock(&cpuset_mutex); |
1931 | return 0; | ||
1932 | } | ||
1933 | |||
1934 | static void cpuset_css_offline(struct cgroup *cgrp) | ||
1935 | { | ||
1936 | struct cpuset *cs = cgroup_cs(cgrp); | ||
1937 | |||
1938 | mutex_lock(&cpuset_mutex); | ||
1939 | |||
1940 | if (is_sched_load_balance(cs)) | ||
1941 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | ||
1942 | |||
1943 | number_of_cpusets--; | ||
1944 | clear_bit(CS_ONLINE, &cs->flags); | ||
1945 | |||
1946 | mutex_unlock(&cpuset_mutex); | ||
1852 | } | 1947 | } |
1853 | 1948 | ||
1854 | /* | 1949 | /* |
1855 | * If the cpuset being removed has its flag 'sched_load_balance' | 1950 | * If the cpuset being removed has its flag 'sched_load_balance' |
1856 | * enabled, then simulate turning sched_load_balance off, which | 1951 | * enabled, then simulate turning sched_load_balance off, which |
1857 | * will call async_rebuild_sched_domains(). | 1952 | * will call rebuild_sched_domains_locked(). |
1858 | */ | 1953 | */ |
1859 | 1954 | ||
1860 | static void cpuset_css_free(struct cgroup *cont) | 1955 | static void cpuset_css_free(struct cgroup *cont) |
1861 | { | 1956 | { |
1862 | struct cpuset *cs = cgroup_cs(cont); | 1957 | struct cpuset *cs = cgroup_cs(cont); |
1863 | 1958 | ||
1864 | if (is_sched_load_balance(cs)) | ||
1865 | update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); | ||
1866 | |||
1867 | number_of_cpusets--; | ||
1868 | free_cpumask_var(cs->cpus_allowed); | 1959 | free_cpumask_var(cs->cpus_allowed); |
1869 | kfree(cs); | 1960 | kfree(cs); |
1870 | } | 1961 | } |
@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont) | |||
1872 | struct cgroup_subsys cpuset_subsys = { | 1963 | struct cgroup_subsys cpuset_subsys = { |
1873 | .name = "cpuset", | 1964 | .name = "cpuset", |
1874 | .css_alloc = cpuset_css_alloc, | 1965 | .css_alloc = cpuset_css_alloc, |
1966 | .css_online = cpuset_css_online, | ||
1967 | .css_offline = cpuset_css_offline, | ||
1875 | .css_free = cpuset_css_free, | 1968 | .css_free = cpuset_css_free, |
1876 | .can_attach = cpuset_can_attach, | 1969 | .can_attach = cpuset_can_attach, |
1970 | .cancel_attach = cpuset_cancel_attach, | ||
1877 | .attach = cpuset_attach, | 1971 | .attach = cpuset_attach, |
1878 | .subsys_id = cpuset_subsys_id, | 1972 | .subsys_id = cpuset_subsys_id, |
1879 | .base_cftypes = files, | 1973 | .base_cftypes = files, |
@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, | |||
1924 | { | 2018 | { |
1925 | struct cgroup *new_cgroup = scan->data; | 2019 | struct cgroup *new_cgroup = scan->data; |
1926 | 2020 | ||
2021 | cgroup_lock(); | ||
1927 | cgroup_attach_task(new_cgroup, tsk); | 2022 | cgroup_attach_task(new_cgroup, tsk); |
2023 | cgroup_unlock(); | ||
1928 | } | 2024 | } |
1929 | 2025 | ||
1930 | /** | 2026 | /** |
@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, | |||
1932 | * @from: cpuset in which the tasks currently reside | 2028 | * @from: cpuset in which the tasks currently reside |
1933 | * @to: cpuset to which the tasks will be moved | 2029 | * @to: cpuset to which the tasks will be moved |
1934 | * | 2030 | * |
1935 | * Called with cgroup_mutex held | 2031 | * Called with cpuset_mutex held |
1936 | * callback_mutex must not be held, as cpuset_attach() will take it. | 2032 | * callback_mutex must not be held, as cpuset_attach() will take it. |
1937 | * | 2033 | * |
1938 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | 2034 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, |
@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
1959 | * removing that CPU or node from all cpusets. If this removes the | 2055 | * removing that CPU or node from all cpusets. If this removes the |
1960 | * last CPU or node from a cpuset, then move the tasks in the empty | 2056 | * last CPU or node from a cpuset, then move the tasks in the empty |
1961 | * cpuset to its next-highest non-empty parent. | 2057 | * cpuset to its next-highest non-empty parent. |
1962 | * | ||
1963 | * Called with cgroup_mutex held | ||
1964 | * callback_mutex must not be held, as cpuset_attach() will take it. | ||
1965 | */ | 2058 | */ |
1966 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | 2059 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) |
1967 | { | 2060 | { |
1968 | struct cpuset *parent; | 2061 | struct cpuset *parent; |
1969 | 2062 | ||
1970 | /* | 2063 | /* |
1971 | * The cgroup's css_sets list is in use if there are tasks | ||
1972 | * in the cpuset; the list is empty if there are none; | ||
1973 | * the cs->css.refcnt seems always 0. | ||
1974 | */ | ||
1975 | if (list_empty(&cs->css.cgroup->css_sets)) | ||
1976 | return; | ||
1977 | |||
1978 | /* | ||
1979 | * Find its next-highest non-empty parent, (top cpuset | 2064 | * Find its next-highest non-empty parent, (top cpuset |
1980 | * has online cpus, so can't be empty). | 2065 | * has online cpus, so can't be empty). |
1981 | */ | 2066 | */ |
1982 | parent = cs->parent; | 2067 | parent = parent_cs(cs); |
1983 | while (cpumask_empty(parent->cpus_allowed) || | 2068 | while (cpumask_empty(parent->cpus_allowed) || |
1984 | nodes_empty(parent->mems_allowed)) | 2069 | nodes_empty(parent->mems_allowed)) |
1985 | parent = parent->parent; | 2070 | parent = parent_cs(parent); |
1986 | 2071 | ||
1987 | move_member_tasks_to_cpuset(cs, parent); | 2072 | move_member_tasks_to_cpuset(cs, parent); |
1988 | } | 2073 | } |
1989 | 2074 | ||
1990 | /* | 2075 | /** |
1991 | * Helper function to traverse cpusets. | 2076 | * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset |
1992 | * It can be used to walk the cpuset tree from top to bottom, completing | 2077 | * @cs: cpuset of interest |
1993 | * one layer before dropping down to the next (thus always processing a | 2078 | * |
1994 | * node before any of its children). | 2079 | * Compare @cs's cpu and mem masks against top_cpuset and if some have gone |
2080 | * offline, update @cs accordingly. If @cs ends up with no CPU or memory, | ||
2081 | * all its tasks are moved to the nearest ancestor with both resources. | ||
1995 | */ | 2082 | */ |
1996 | static struct cpuset *cpuset_next(struct list_head *queue) | 2083 | static void cpuset_propagate_hotplug_workfn(struct work_struct *work) |
1997 | { | 2084 | { |
1998 | struct cpuset *cp; | 2085 | static cpumask_t off_cpus; |
1999 | struct cpuset *child; /* scans child cpusets of cp */ | 2086 | static nodemask_t off_mems, tmp_mems; |
2000 | struct cgroup *cont; | 2087 | struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); |
2088 | bool is_empty; | ||
2001 | 2089 | ||
2002 | if (list_empty(queue)) | 2090 | mutex_lock(&cpuset_mutex); |
2003 | return NULL; | 2091 | |
2092 | cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); | ||
2093 | nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); | ||
2004 | 2094 | ||
2005 | cp = list_first_entry(queue, struct cpuset, stack_list); | 2095 | /* remove offline cpus from @cs */ |
2006 | list_del(queue->next); | 2096 | if (!cpumask_empty(&off_cpus)) { |
2007 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 2097 | mutex_lock(&callback_mutex); |
2008 | child = cgroup_cs(cont); | 2098 | cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus); |
2009 | list_add_tail(&child->stack_list, queue); | 2099 | mutex_unlock(&callback_mutex); |
2100 | update_tasks_cpumask(cs, NULL); | ||
2010 | } | 2101 | } |
2011 | 2102 | ||
2012 | return cp; | 2103 | /* remove offline mems from @cs */ |
2104 | if (!nodes_empty(off_mems)) { | ||
2105 | tmp_mems = cs->mems_allowed; | ||
2106 | mutex_lock(&callback_mutex); | ||
2107 | nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); | ||
2108 | mutex_unlock(&callback_mutex); | ||
2109 | update_tasks_nodemask(cs, &tmp_mems, NULL); | ||
2110 | } | ||
2111 | |||
2112 | is_empty = cpumask_empty(cs->cpus_allowed) || | ||
2113 | nodes_empty(cs->mems_allowed); | ||
2114 | |||
2115 | mutex_unlock(&cpuset_mutex); | ||
2116 | |||
2117 | /* | ||
2118 | * If @cs became empty, move tasks to the nearest ancestor with | ||
2119 | * execution resources. This is a full cgroup operation which will | ||
2120 | * also call back into cpuset. Should be done outside any lock. | ||
2121 | */ | ||
2122 | if (is_empty) | ||
2123 | remove_tasks_in_empty_cpuset(cs); | ||
2124 | |||
2125 | /* the following may free @cs, should be the last operation */ | ||
2126 | css_put(&cs->css); | ||
2013 | } | 2127 | } |
2014 | 2128 | ||
2129 | /** | ||
2130 | * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset | ||
2131 | * @cs: cpuset of interest | ||
2132 | * | ||
2133 | * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and | ||
2134 | * memory masks according to top_cpuset. | ||
2135 | */ | ||
2136 | static void schedule_cpuset_propagate_hotplug(struct cpuset *cs) | ||
2137 | { | ||
2138 | /* | ||
2139 | * Pin @cs. The refcnt will be released when the work item | ||
2140 | * finishes executing. | ||
2141 | */ | ||
2142 | if (!css_tryget(&cs->css)) | ||
2143 | return; | ||
2015 | 2144 | ||
2016 | /* | 2145 | /* |
2017 | * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory | 2146 | * Queue @cs->hotplug_work. If already pending, lose the css ref. |
2018 | * online/offline) and update the cpusets accordingly. | 2147 | * cpuset_propagate_hotplug_wq is ordered and propagation will |
2019 | * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such | 2148 | * happen in the order this function is called. |
2020 | * cpuset must be moved to a parent cpuset. | 2149 | */ |
2150 | if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work)) | ||
2151 | css_put(&cs->css); | ||
2152 | } | ||
2153 | |||
2154 | /** | ||
2155 | * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset | ||
2021 | * | 2156 | * |
2022 | * Called with cgroup_mutex held. We take callback_mutex to modify | 2157 | * This function is called after either CPU or memory configuration has |
2023 | * cpus_allowed and mems_allowed. | 2158 | * changed and updates cpuset accordingly. The top_cpuset is always |
2159 | * synchronized to cpu_active_mask and N_MEMORY, which is necessary in | ||
2160 | * order to make cpusets transparent (of no affect) on systems that are | ||
2161 | * actively using CPU hotplug but making no active use of cpusets. | ||
2024 | * | 2162 | * |
2025 | * This walk processes the tree from top to bottom, completing one layer | 2163 | * Non-root cpusets are only affected by offlining. If any CPUs or memory |
2026 | * before dropping down to the next. It always processes a node before | 2164 | * nodes have been taken down, schedule_cpuset_propagate_hotplug() is invoked on all |
2027 | * any of its children. | 2165 | * descendants. |
2028 | * | 2166 | * |
2029 | * In the case of memory hot-unplug, it will remove nodes from N_MEMORY | 2167 | * Note that CPU offlining during suspend is ignored. We don't modify |
2030 | * if all present pages from a node are offlined. | 2168 | * cpusets across suspend/resume cycles at all. |
2031 | */ | 2169 | */ |
2032 | static void | 2170 | static void cpuset_hotplug_workfn(struct work_struct *work) |
2033 | scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event) | ||
2034 | { | 2171 | { |
2035 | LIST_HEAD(queue); | 2172 | static cpumask_t new_cpus, tmp_cpus; |
2036 | struct cpuset *cp; /* scans cpusets being updated */ | 2173 | static nodemask_t new_mems, tmp_mems; |
2037 | static nodemask_t oldmems; /* protected by cgroup_mutex */ | 2174 | bool cpus_updated, mems_updated; |
2175 | bool cpus_offlined, mems_offlined; | ||
2038 | 2176 | ||
2039 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 2177 | mutex_lock(&cpuset_mutex); |
2040 | 2178 | ||
2041 | switch (event) { | 2179 | /* fetch the available cpus/mems and find out which changed how */ |
2042 | case CPUSET_CPU_OFFLINE: | 2180 | cpumask_copy(&new_cpus, cpu_active_mask); |
2043 | while ((cp = cpuset_next(&queue)) != NULL) { | 2181 | new_mems = node_states[N_MEMORY]; |
2044 | 2182 | ||
2045 | /* Continue past cpusets with all cpus online */ | 2183 | cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus); |
2046 | if (cpumask_subset(cp->cpus_allowed, cpu_active_mask)) | 2184 | cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed, |
2047 | continue; | 2185 | &new_cpus); |
2048 | 2186 | ||
2049 | /* Remove offline cpus from this cpuset. */ | 2187 | mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems); |
2050 | mutex_lock(&callback_mutex); | 2188 | nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems); |
2051 | cpumask_and(cp->cpus_allowed, cp->cpus_allowed, | 2189 | mems_offlined = !nodes_empty(tmp_mems); |
2052 | cpu_active_mask); | ||
2053 | mutex_unlock(&callback_mutex); | ||
2054 | 2190 | ||
2055 | /* Move tasks from the empty cpuset to a parent */ | 2191 | /* synchronize cpus_allowed to cpu_active_mask */ |
2056 | if (cpumask_empty(cp->cpus_allowed)) | 2192 | if (cpus_updated) { |
2057 | remove_tasks_in_empty_cpuset(cp); | 2193 | mutex_lock(&callback_mutex); |
2058 | else | 2194 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); |
2059 | update_tasks_cpumask(cp, NULL); | 2195 | mutex_unlock(&callback_mutex); |
2060 | } | 2196 | /* we don't mess with cpumasks of tasks in top_cpuset */ |
2061 | break; | 2197 | } |
2062 | 2198 | ||
2063 | case CPUSET_MEM_OFFLINE: | 2199 | /* synchronize mems_allowed to N_MEMORY */ |
2064 | while ((cp = cpuset_next(&queue)) != NULL) { | 2200 | if (mems_updated) { |
2201 | tmp_mems = top_cpuset.mems_allowed; | ||
2202 | mutex_lock(&callback_mutex); | ||
2203 | top_cpuset.mems_allowed = new_mems; | ||
2204 | mutex_unlock(&callback_mutex); | ||
2205 | update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL); | ||
2206 | } | ||
2065 | 2207 | ||
2066 | /* Continue past cpusets with all mems online */ | 2208 | /* if cpus or mems went down, we need to propagate to descendants */ |
2067 | if (nodes_subset(cp->mems_allowed, | 2209 | if (cpus_offlined || mems_offlined) { |
2068 | node_states[N_MEMORY])) | 2210 | struct cpuset *cs; |
2069 | continue; | 2211 | struct cgroup *pos_cgrp; |
2070 | 2212 | ||
2071 | oldmems = cp->mems_allowed; | 2213 | rcu_read_lock(); |
2214 | cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) | ||
2215 | schedule_cpuset_propagate_hotplug(cs); | ||
2216 | rcu_read_unlock(); | ||
2217 | } | ||
2072 | 2218 | ||
2073 | /* Remove offline mems from this cpuset. */ | 2219 | mutex_unlock(&cpuset_mutex); |
2074 | mutex_lock(&callback_mutex); | ||
2075 | nodes_and(cp->mems_allowed, cp->mems_allowed, | ||
2076 | node_states[N_MEMORY]); | ||
2077 | mutex_unlock(&callback_mutex); | ||
2078 | 2220 | ||
2079 | /* Move tasks from the empty cpuset to a parent */ | 2221 | /* wait for propagations to finish */ |
2080 | if (nodes_empty(cp->mems_allowed)) | 2222 | flush_workqueue(cpuset_propagate_hotplug_wq); |
2081 | remove_tasks_in_empty_cpuset(cp); | 2223 | |
2082 | else | 2224 | /* rebuild sched domains if cpus_allowed has changed */ |
2083 | update_tasks_nodemask(cp, &oldmems, NULL); | 2225 | if (cpus_updated) { |
2084 | } | 2226 | struct sched_domain_attr *attr; |
2227 | cpumask_var_t *doms; | ||
2228 | int ndoms; | ||
2229 | |||
2230 | mutex_lock(&cpuset_mutex); | ||
2231 | ndoms = generate_sched_domains(&doms, &attr); | ||
2232 | mutex_unlock(&cpuset_mutex); | ||
2233 | |||
2234 | partition_sched_domains(ndoms, doms, attr); | ||
2085 | } | 2235 | } |
2086 | } | 2236 | } |
2087 | 2237 | ||
2088 | /* | ||
2089 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | ||
2090 | * period. This is necessary in order to make cpusets transparent | ||
2091 | * (of no affect) on systems that are actively using CPU hotplug | ||
2092 | * but making no active use of cpusets. | ||
2093 | * | ||
2094 | * The only exception to this is suspend/resume, where we don't | ||
2095 | * modify cpusets at all. | ||
2096 | * | ||
2097 | * This routine ensures that top_cpuset.cpus_allowed tracks | ||
2098 | * cpu_active_mask on each CPU hotplug (cpuhp) event. | ||
2099 | * | ||
2100 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
2101 | * before calling generate_sched_domains(). | ||
2102 | * | ||
2103 | * @cpu_online: Indicates whether this is a CPU online event (true) or | ||
2104 | * a CPU offline event (false). | ||
2105 | */ | ||
2106 | void cpuset_update_active_cpus(bool cpu_online) | 2238 | void cpuset_update_active_cpus(bool cpu_online) |
2107 | { | 2239 | { |
2108 | struct sched_domain_attr *attr; | 2240 | /* |
2109 | cpumask_var_t *doms; | 2241 | * We're inside cpu hotplug critical region which usually nests |
2110 | int ndoms; | 2242 | * inside cgroup synchronization. Bounce actual hotplug processing |
2111 | 2243 | * to a work item to avoid reverse locking order. | |
2112 | cgroup_lock(); | 2244 | * |
2113 | mutex_lock(&callback_mutex); | 2245 | * We still need to do partition_sched_domains() synchronously; |
2114 | cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); | 2246 | * otherwise, the scheduler will get confused and put tasks to the |
2115 | mutex_unlock(&callback_mutex); | 2247 | * dead CPU. Fall back to the default single domain. |
2116 | 2248 | * cpuset_hotplug_workfn() will rebuild it as necessary. | |
2117 | if (!cpu_online) | 2249 | */ |
2118 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); | 2250 | partition_sched_domains(1, NULL, NULL); |
2119 | 2251 | schedule_work(&cpuset_hotplug_work); | |
2120 | ndoms = generate_sched_domains(&doms, &attr); | ||
2121 | cgroup_unlock(); | ||
2122 | |||
2123 | /* Have scheduler rebuild the domains */ | ||
2124 | partition_sched_domains(ndoms, doms, attr); | ||
2125 | } | 2252 | } |
2126 | 2253 | ||
2127 | #ifdef CONFIG_MEMORY_HOTPLUG | 2254 | #ifdef CONFIG_MEMORY_HOTPLUG |
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online) | |||
2133 | static int cpuset_track_online_nodes(struct notifier_block *self, | 2260 | static int cpuset_track_online_nodes(struct notifier_block *self, |
2134 | unsigned long action, void *arg) | 2261 | unsigned long action, void *arg) |
2135 | { | 2262 | { |
2136 | static nodemask_t oldmems; /* protected by cgroup_mutex */ | 2263 | schedule_work(&cpuset_hotplug_work); |
2137 | |||
2138 | cgroup_lock(); | ||
2139 | switch (action) { | ||
2140 | case MEM_ONLINE: | ||
2141 | oldmems = top_cpuset.mems_allowed; | ||
2142 | mutex_lock(&callback_mutex); | ||
2143 | top_cpuset.mems_allowed = node_states[N_MEMORY]; | ||
2144 | mutex_unlock(&callback_mutex); | ||
2145 | update_tasks_nodemask(&top_cpuset, &oldmems, NULL); | ||
2146 | break; | ||
2147 | case MEM_OFFLINE: | ||
2148 | /* | ||
2149 | * needn't update top_cpuset.mems_allowed explicitly because | ||
2150 | * scan_cpusets_upon_hotplug() will update it. | ||
2151 | */ | ||
2152 | scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE); | ||
2153 | break; | ||
2154 | default: | ||
2155 | break; | ||
2156 | } | ||
2157 | cgroup_unlock(); | ||
2158 | |||
2159 | return NOTIFY_OK; | 2264 | return NOTIFY_OK; |
2160 | } | 2265 | } |
2161 | #endif | 2266 | #endif |
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void) | |||
2173 | 2278 | ||
2174 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); | 2279 | hotplug_memory_notifier(cpuset_track_online_nodes, 10); |
2175 | 2280 | ||
2176 | cpuset_wq = create_singlethread_workqueue("cpuset"); | 2281 | cpuset_propagate_hotplug_wq = |
2177 | BUG_ON(!cpuset_wq); | 2282 | alloc_ordered_workqueue("cpuset_hotplug", 0); |
2283 | BUG_ON(!cpuset_propagate_hotplug_wq); | ||
2178 | } | 2284 | } |
2179 | 2285 | ||
2180 | /** | 2286 | /** |
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) | |||
2273 | */ | 2379 | */ |
2274 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) | 2380 | static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) |
2275 | { | 2381 | { |
2276 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent) | 2382 | while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) |
2277 | cs = cs->parent; | 2383 | cs = parent_cs(cs); |
2278 | return cs; | 2384 | return cs; |
2279 | } | 2385 | } |
2280 | 2386 | ||
@@ -2412,17 +2518,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) | |||
2412 | } | 2518 | } |
2413 | 2519 | ||
2414 | /** | 2520 | /** |
2415 | * cpuset_unlock - release lock on cpuset changes | ||
2416 | * | ||
2417 | * Undo the lock taken in a previous cpuset_lock() call. | ||
2418 | */ | ||
2419 | |||
2420 | void cpuset_unlock(void) | ||
2421 | { | ||
2422 | mutex_unlock(&callback_mutex); | ||
2423 | } | ||
2424 | |||
2425 | /** | ||
2426 | * cpuset_mem_spread_node() - On which node to begin search for a file page | 2521 | * cpuset_mem_spread_node() - On which node to begin search for a file page |
2427 | * cpuset_slab_spread_node() - On which node to begin search for a slab page | 2522 | * cpuset_slab_spread_node() - On which node to begin search for a slab page |
2428 | * | 2523 | * |
@@ -2568,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void) | |||
2568 | * - Used for /proc/<pid>/cpuset. | 2663 | * - Used for /proc/<pid>/cpuset. |
2569 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | 2664 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
2570 | * doesn't really matter if tsk->cpuset changes after we read it, | 2665 | * doesn't really matter if tsk->cpuset changes after we read it, |
2571 | * and we take cgroup_mutex, keeping cpuset_attach() from changing it | 2666 | * and we take cpuset_mutex, keeping cpuset_attach() from changing it |
2572 | * anyway. | 2667 | * anyway. |
2573 | */ | 2668 | */ |
2574 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2669 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) |
@@ -2590,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v) | |||
2590 | if (!tsk) | 2685 | if (!tsk) |
2591 | goto out_free; | 2686 | goto out_free; |
2592 | 2687 | ||
2593 | retval = -EINVAL; | 2688 | rcu_read_lock(); |
2594 | cgroup_lock(); | ||
2595 | css = task_subsys_state(tsk, cpuset_subsys_id); | 2689 | css = task_subsys_state(tsk, cpuset_subsys_id); |
2596 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); | 2690 | retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); |
2691 | rcu_read_unlock(); | ||
2597 | if (retval < 0) | 2692 | if (retval < 0) |
2598 | goto out_unlock; | 2693 | goto out_put_task; |
2599 | seq_puts(m, buf); | 2694 | seq_puts(m, buf); |
2600 | seq_putc(m, '\n'); | 2695 | seq_putc(m, '\n'); |
2601 | out_unlock: | 2696 | out_put_task: |
2602 | cgroup_unlock(); | ||
2603 | put_task_struct(tsk); | 2697 | put_task_struct(tsk); |
2604 | out_free: | 2698 | out_free: |
2605 | kfree(buf); | 2699 | kfree(buf); |