author     Tejun Heo <tj@kernel.org>   2013-01-07 11:51:08 -0500
committer  Tejun Heo <tj@kernel.org>   2013-01-07 11:51:08 -0500
commit     5d21cc2db040d01f8c19b8602f6987813e1176b4 (patch)
tree       0dcb94aefa3fee2e4c436a50fc5eeb9e45fa3988 /kernel/cpuset.c
parent     02bb586372a71595203b3ff19a9be48eaa076f6c (diff)
cpuset: replace cgroup_mutex locking with cpuset internal locking
Supposedly for historical reasons, cpuset depends on cgroup core for
locking. It depends on cgroup_mutex in cgroup callbacks and grabs
cgroup_mutex from other places where it wants to be synchronized.
This is majorly messy and highly prone to introducing circular locking
dependencies, especially because cgroup_mutex is supposed to be one of
the outermost locks.
As previous patches already plugged the possible races which may happen
by decoupling from cgroup_mutex, replacing cgroup_mutex with the
cpuset-specific cpuset_mutex is mostly straightforward. Introduce
cpuset_mutex, replace all occurrences of cgroup_mutex with it, and add
cpuset_mutex locking to places which inherited cgroup_mutex from
cgroup core.
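The conversion at most call sites is the same mechanical substitution.
As a minimal illustrative sketch (simplified from the
rebuild_sched_domains() hunk in the diff below), a caller that used to
piggyback on cgroup core now does:

        /* before: synchronization borrowed from cgroup core */
        cgroup_lock();
        rebuild_sched_domains_locked();
        cgroup_unlock();

        /* after: cpuset-internal mutex */
        mutex_lock(&cpuset_mutex);
        rebuild_sched_domains_locked();
        mutex_unlock(&cpuset_mutex);

and "caller must hold the lock" checks such as
WARN_ON_ONCE(!cgroup_lock_is_held()) become
lockdep_assert_held(&cpuset_mutex).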
The only complication is from cpuset wanting to initiate task
migration when a cpuset loses all cpus or memory nodes. Task
migration may go through full cgroup and all subsystem locking and
should be initiated without holding any cpuset-specific lock; however,
a previous patch already made hotplug handling asynchronous, so moving
the task migration part outside other locks is easy.
cpuset_propagate_hotplug_workfn() now invokes
remove_tasks_in_empty_cpuset() without holding any lock.
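As a sketch of that pattern (condensed from the
cpuset_propagate_hotplug_workfn() hunk in the diff below), the worker
records whether the cpuset became empty while holding cpuset_mutex,
drops the lock, and only then initiates the migration:

        bool is_empty;

        mutex_lock(&cpuset_mutex);
        /* ... prune cs->cpus_allowed / cs->mems_allowed for offlined resources ... */
        is_empty = cpumask_empty(cs->cpus_allowed) ||
                   nodes_empty(cs->mems_allowed);
        mutex_unlock(&cpuset_mutex);

        /* full cgroup operation, may call back into cpuset; no cpuset lock held */
        if (is_empty)
                remove_tasks_in_empty_cpuset(cs);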
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--   kernel/cpuset.c   188
1 files changed, 107 insertions, 81 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 644281003f5d..5e348ae37ce9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -208,23 +208,20 @@ static struct cpuset top_cpuset = {
 		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
 
 /*
- * There are two global mutexes guarding cpuset structures. The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
- * callback_mutex, below. They can nest. It is ok to first take
- * cgroup_mutex, then nest callback_mutex. We also require taking
- * task_lock() when dereferencing a task's cpuset pointer. See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets. If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets. It can perform various checks on
- * the cpuset structure first, knowing nothing will change. It can
- * also allocate memory while just holding cgroup_mutex. While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets. Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex. The latter may nest inside the former. We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets. If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets. It can perform various checks on the cpuset structure
+ * first, knowing nothing will change. It can also allocate memory while
+ * just holding cpuset_mutex. While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
  * callback_mutex, as that would risk double tripping on callback_mutex
@@ -246,6 +243,7 @@ static struct cpuset top_cpuset = {
  * guidelines for accessing subsystem state in kernel/cgroup.c
  */
 
+static DEFINE_MUTEX(cpuset_mutex);
 static DEFINE_MUTEX(callback_mutex);
 
 /*
@@ -351,7 +349,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 /*
  * update task's spread flag if cpuset's page/slab spread flag is set
  *
- * Called with callback_mutex/cgroup_mutex held
+ * Called with callback_mutex/cpuset_mutex held
  */
 static void cpuset_update_task_spread_flag(struct cpuset *cs,
 					struct task_struct *tsk)
@@ -371,7 +369,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
  *
  * One cpuset is a subset of another if all its allowed CPUs and
  * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cgroup_mutex.
+ * are only set if the other's are set. Call holding cpuset_mutex.
  */
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -420,7 +418,7 @@ static void free_trial_cpuset(struct cpuset *trial)
  * If we replaced the flag and mask values of the current cpuset
  * (cur) with those values in the trial cpuset (trial), would
  * our various subset and exclusive rules still be valid? Presumes
- * cgroup_mutex held.
+ * cpuset_mutex held.
  *
  * 'cur' is the address of an actual, in-use cpuset. Operations
  * such as list traversal that depend on the actual address of the
@@ -555,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  * domains when operating in the severe memory shortage situations
  * that could cause allocation failures below.
  *
- * Must be called with cgroup_lock held.
+ * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
@@ -766,7 +764,7 @@ done:
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
- * Call with cgroup_mutex held. Takes get_online_cpus().
+ * Call with cpuset_mutex held. Takes get_online_cpus().
 */
 static void rebuild_sched_domains_locked(void)
 {
@@ -774,7 +772,7 @@ static void rebuild_sched_domains_locked(void)
 	cpumask_var_t *doms;
 	int ndoms;
 
-	WARN_ON_ONCE(!cgroup_lock_is_held());
+	lockdep_assert_held(&cpuset_mutex);
 	get_online_cpus();
 
 	/* Generate domain masks and attrs */
@@ -800,9 +798,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
 
 void rebuild_sched_domains(void)
 {
-	cgroup_lock();
+	mutex_lock(&cpuset_mutex);
 	rebuild_sched_domains_locked();
-	cgroup_unlock();
+	mutex_unlock(&cpuset_mutex);
 }
 
 /**
@@ -810,7 +808,7 @@ void rebuild_sched_domains(void)
  * @tsk: task to test
  * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
  *
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
  * Called for each task in a cgroup by cgroup_scan_tasks().
  * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
  * words, if its mask is not equal to its cpuset's mask).
@@ -831,7 +829,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
  * cpus_allowed mask needs to be changed.
  *
  * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
  */
 static void cpuset_change_cpumask(struct task_struct *tsk,
 				  struct cgroup_scanner *scan)
@@ -844,7 +842,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
  * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
  * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
  *
  * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
  * calling callback functions for each.
@@ -934,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  * Temporarilly set tasks mems_allowed to target nodes of migration,
  * so that the migration code can allocate pages on these nodes.
  *
- * Call holding cgroup_mutex, so current's cpuset won't change
+ * Call holding cpuset_mutex, so current's cpuset won't change
  * during this call, as manage_mutex holds off any cpuset_attach()
  * calls. Therefore we don't need to take task_lock around the
  * call to guarantee_online_mems(), as we know no one is changing
@@ -1009,7 +1007,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 /*
  * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
  * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
- * memory_migrate flag is set. Called with cgroup_mutex held.
+ * memory_migrate flag is set. Called with cpuset_mutex held.
  */
 static void cpuset_change_nodemask(struct task_struct *p,
 				   struct cgroup_scanner *scan)
@@ -1018,7 +1016,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
-	static nodemask_t newmems;	/* protected by cgroup_mutex */
+	static nodemask_t newmems;	/* protected by cpuset_mutex */
 
 	cs = cgroup_cs(scan->cg);
 	guarantee_online_mems(cs, &newmems);
@@ -1045,7 +1043,7 @@ static void *cpuset_being_rebound;
  * @oldmem: old mems_allowed of cpuset cs
  * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
  * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
  * if @heap != NULL.
  */
@@ -1067,7 +1065,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
  * take while holding tasklist_lock. Forks can happen - the
  * mpol_dup() cpuset_being_rebound check will catch such forks,
  * and rebind their vma mempolicies too. Because we still hold
- * the global cgroup_mutex, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
  * will be contending for the global variable cpuset_being_rebound.
  * It's ok if we rebind the same mm twice; mpol_rebind_mm()
  * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1086,7 +1084,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
  * mempolicies and if the cpuset is marked 'memory_migrate',
  * migrate the tasks pages to the new memory.
  *
- * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_mutex during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
@@ -1184,7 +1182,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
  * Called by cgroup_scan_tasks() for each task in a cgroup.
  *
  * We don't need to re-check for the cgroup/cpuset membership, since we're
- * holding cgroup_lock() at this point.
+ * holding cpuset_mutex at this point.
  */
 static void cpuset_change_flag(struct task_struct *tsk,
 			       struct cgroup_scanner *scan)
@@ -1197,7 +1195,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
  * @cs: the cpuset in which each task's spread flags needs to be changed
  * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
  *
  * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
  * calling callback functions for each.
@@ -1222,7 +1220,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
  * cs: the cpuset to update
  * turning_on: whether the flag is being set or cleared
  *
- * Call with cgroup_mutex held.
+ * Call with cpuset_mutex held.
  */
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1370,15 +1368,18 @@ static int fmeter_getrate(struct fmeter *fmp)
 	return val;
 }
 
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
 static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct task_struct *task;
 	int ret;
 
+	mutex_lock(&cpuset_mutex);
+
+	ret = -ENOSPC;
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
-		return -ENOSPC;
+		goto out_unlock;
 
 	cgroup_taskset_for_each(task, cgrp, tset) {
 		/*
@@ -1390,10 +1391,12 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 		 * set_cpus_allowed_ptr() on all attached tasks before
 		 * cpus_allowed may be changed.
 		 */
+		ret = -EINVAL;
 		if (task->flags & PF_THREAD_BOUND)
-			return -EINVAL;
-		if ((ret = security_task_setscheduler(task)))
-			return ret;
+			goto out_unlock;
+		ret = security_task_setscheduler(task);
+		if (ret)
+			goto out_unlock;
 	}
 
 	/*
@@ -1401,18 +1404,22 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	 * changes which zero cpus/mems_allowed.
 	 */
 	cs->attach_in_progress++;
-
-	return 0;
+	ret = 0;
+out_unlock:
+	mutex_unlock(&cpuset_mutex);
+	return ret;
 }
 
 static void cpuset_cancel_attach(struct cgroup *cgrp,
 				 struct cgroup_taskset *tset)
 {
+	mutex_lock(&cpuset_mutex);
 	cgroup_cs(cgrp)->attach_in_progress--;
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
- * Protected by cgroup_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
  * but we can't allocate it dynamically there. Define it global and
  * allocate from cpuset_init().
  */
@@ -1420,7 +1427,7 @@ static cpumask_var_t cpus_attach;
 
 static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
-	/* static bufs protected by cgroup_mutex */
+	/* static bufs protected by cpuset_mutex */
 	static nodemask_t cpuset_attach_nodemask_from;
 	static nodemask_t cpuset_attach_nodemask_to;
 	struct mm_struct *mm;
@@ -1430,6 +1437,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct cpuset *oldcs = cgroup_cs(oldcgrp);
 
+	mutex_lock(&cpuset_mutex);
+
 	/* prepare for attach */
 	if (cs == &top_cpuset)
 		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1473,6 +1482,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 	 */
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		schedule_cpuset_propagate_hotplug(cs);
+
+	mutex_unlock(&cpuset_mutex);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -1494,12 +1505,13 @@ typedef enum {
 
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
-	int retval = 0;
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
+	int retval = -ENODEV;
 
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
+	mutex_lock(&cpuset_mutex);
+	if (!is_cpuset_online(cs))
+		goto out_unlock;
 
 	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
@@ -1533,18 +1545,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 		retval = -EINVAL;
 		break;
 	}
-	cgroup_unlock();
+out_unlock:
+	mutex_unlock(&cpuset_mutex);
 	return retval;
 }
 
 static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 {
-	int retval = 0;
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
+	int retval = -ENODEV;
 
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
+	mutex_lock(&cpuset_mutex);
+	if (!is_cpuset_online(cs))
+		goto out_unlock;
 
 	switch (type) {
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1554,7 +1568,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 		retval = -EINVAL;
 		break;
 	}
-	cgroup_unlock();
+out_unlock:
+	mutex_unlock(&cpuset_mutex);
 	return retval;
 }
 
@@ -1564,9 +1579,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 				const char *buf)
 {
-	int retval = 0;
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct cpuset *trialcs;
+	int retval = -ENODEV;
 
 	/*
 	 * CPU or memory hotunplug may leave @cs w/o any execution
@@ -1586,13 +1601,14 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 	flush_work(&cpuset_hotplug_work);
 	flush_workqueue(cpuset_propagate_hotplug_wq);
 
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
+	mutex_lock(&cpuset_mutex);
+	if (!is_cpuset_online(cs))
+		goto out_unlock;
 
 	trialcs = alloc_trial_cpuset(cs);
 	if (!trialcs) {
 		retval = -ENOMEM;
-		goto out;
+		goto out_unlock;
 	}
 
 	switch (cft->private) {
@@ -1608,8 +1624,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	free_trial_cpuset(trialcs);
-out:
-	cgroup_unlock();
+out_unlock:
+	mutex_unlock(&cpuset_mutex);
 	return retval;
 }
 
@@ -1867,6 +1883,8 @@ static int cpuset_css_online(struct cgroup *cgrp)
 	if (!parent)
 		return 0;
 
+	mutex_lock(&cpuset_mutex);
+
 	set_bit(CS_ONLINE, &cs->flags);
 	if (is_spread_page(parent))
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1876,7 +1894,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
 	number_of_cpusets++;
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
-		return 0;
+		goto out_unlock;
 
 	/*
 	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1895,7 +1913,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
 	cpuset_for_each_child(tmp_cs, pos_cg, parent) {
 		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
 			rcu_read_unlock();
-			return 0;
+			goto out_unlock;
 		}
 	}
 	rcu_read_unlock();
@@ -1904,7 +1922,8 @@ static int cpuset_css_online(struct cgroup *cgrp)
 	cs->mems_allowed = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	mutex_unlock(&callback_mutex);
-
+out_unlock:
+	mutex_unlock(&cpuset_mutex);
 	return 0;
 }
 
@@ -1912,8 +1931,7 @@ static void cpuset_css_offline(struct cgroup *cgrp)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
 
-	/* css_offline is called w/o cgroup_mutex, grab it */
-	cgroup_lock();
+	mutex_lock(&cpuset_mutex);
 
 	if (is_sched_load_balance(cs))
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -1921,7 +1939,7 @@ static void cpuset_css_offline(struct cgroup *cgrp)
 	number_of_cpusets--;
 	clear_bit(CS_ONLINE, &cs->flags);
 
-	cgroup_unlock();
+	mutex_unlock(&cpuset_mutex);
 }
 
 /*
@@ -1996,7 +2014,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
 {
 	struct cgroup *new_cgroup = scan->data;
 
+	cgroup_lock();
 	cgroup_attach_task(new_cgroup, tsk);
+	cgroup_unlock();
 }
 
 /**
@@ -2004,7 +2024,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
  * @from: cpuset in which the tasks currently reside
  * @to: cpuset to which the tasks will be moved
  *
- * Called with cgroup_mutex held
+ * Called with cpuset_mutex held
  * callback_mutex must not be held, as cpuset_attach() will take it.
  *
  * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -2031,9 +2051,6 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
  * removing that CPU or node from all cpusets. If this removes the
  * last CPU or node from a cpuset, then move the tasks in the empty
  * cpuset to its next-highest non-empty parent.
- *
- * Called with cgroup_mutex held
- * callback_mutex must not be held, as cpuset_attach() will take it.
  */
 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 {
@@ -2089,8 +2106,9 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
 	static cpumask_t off_cpus;
 	static nodemask_t off_mems, tmp_mems;
 	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
+	bool is_empty;
 
-	cgroup_lock();
+	mutex_lock(&cpuset_mutex);
 
 	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
 	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
@@ -2112,10 +2130,18 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
 		update_tasks_nodemask(cs, &tmp_mems, NULL);
 	}
 
-	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
-		remove_tasks_in_empty_cpuset(cs);
+	is_empty = cpumask_empty(cs->cpus_allowed) ||
+		nodes_empty(cs->mems_allowed);
 
-	cgroup_unlock();
+	mutex_unlock(&cpuset_mutex);
+
+	/*
+	 * If @cs became empty, move tasks to the nearest ancestor with
+	 * execution resources. This is full cgroup operation which will
+	 * also call back into cpuset. Should be done outside any lock.
+	 */
+	if (is_empty)
+		remove_tasks_in_empty_cpuset(cs);
 
 	/* the following may free @cs, should be the last operation */
 	css_put(&cs->css);
@@ -2169,7 +2195,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	bool cpus_updated, mems_updated;
 	bool cpus_offlined, mems_offlined;
 
-	cgroup_lock();
+	mutex_lock(&cpuset_mutex);
 
 	/* fetch the available cpus/mems and find out which changed how */
 	cpumask_copy(&new_cpus, cpu_active_mask);
@@ -2211,7 +2237,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		schedule_cpuset_propagate_hotplug(cs);
 	}
 
-	cgroup_unlock();
+	mutex_unlock(&cpuset_mutex);
 
 	/* wait for propagations to finish */
 	flush_workqueue(cpuset_propagate_hotplug_wq);
@@ -2222,9 +2248,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 		cpumask_var_t *doms;
 		int ndoms;
 
-		cgroup_lock();
+		mutex_lock(&cpuset_mutex);
 		ndoms = generate_sched_domains(&doms, &attr);
-		cgroup_unlock();
+		mutex_unlock(&cpuset_mutex);
 
 		partition_sched_domains(ndoms, doms, attr);
 	}
@@ -2650,7 +2676,7 @@ void __cpuset_memory_pressure_bump(void)
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
- *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
+ *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
 static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2673,7 +2699,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
 		goto out_free;
 
 	retval = -EINVAL;
-	cgroup_lock();
+	mutex_lock(&cpuset_mutex);
 	css = task_subsys_state(tsk, cpuset_subsys_id);
 	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
 	if (retval < 0)
@@ -2681,7 +2707,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out_unlock:
-	cgroup_unlock();
+	mutex_unlock(&cpuset_mutex);
 	put_task_struct(tsk);
 out_free:
 	kfree(buf);