aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Tejun Heo <tj@kernel.org> 2013-01-07 11:51:08 -0500
committer: Tejun Heo <tj@kernel.org> 2013-01-07 11:51:08 -0500
commit: 5d21cc2db040d01f8c19b8602f6987813e1176b4 (patch)
tree: 0dcb94aefa3fee2e4c436a50fc5eeb9e45fa3988 /kernel
parent: 02bb586372a71595203b3ff19a9be48eaa076f6c (diff)
cpuset: replace cgroup_mutex locking with cpuset internal locking
Supposedly for historical reasons, cpuset depends on cgroup core for locking. It depends on cgroup_mutex in cgroup callbacks and grabs cgroup_mutex from other places where it wants to be synchronized. This is majorly messy and highly prone to introducing circular locking dependencies, especially because cgroup_mutex is supposed to be one of the outermost locks.

As previous patches already plugged possible races which may happen by decoupling from cgroup_mutex, replacing cgroup_mutex with the cpuset-specific cpuset_mutex is mostly straightforward. Introduce cpuset_mutex, replace all occurrences of cgroup_mutex with it, and add cpuset_mutex locking to places which inherited cgroup_mutex from cgroup core.

The only complication is from cpuset wanting to initiate task migration when a cpuset loses all cpus or memory nodes. Task migration may go through full cgroup and all subsystem locking and should be initiated without holding any cpuset-specific lock; however, a previous patch already made hotplug handled asynchronously, and moving the task migration part outside other locks is easy. cpuset_propagate_hotplug_workfn() now invokes remove_tasks_in_empty_cpuset() without holding any lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cpuset.c 188
1 files changed, 107 insertions, 81 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 644281003f5d..5e348ae37ce9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -208,23 +208,20 @@ static struct cpuset top_cpuset = {
208 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) 208 if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
209 209
210/* 210/*
211 * There are two global mutexes guarding cpuset structures. The first 211 * There are two global mutexes guarding cpuset structures - cpuset_mutex
212 * is the main control groups cgroup_mutex, accessed via 212 * and callback_mutex. The latter may nest inside the former. We also
213 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific 213 * require taking task_lock() when dereferencing a task's cpuset pointer.
214 * callback_mutex, below. They can nest. It is ok to first take 214 * See "The task_lock() exception", at the end of this comment.
215 * cgroup_mutex, then nest callback_mutex. We also require taking 215 *
216 * task_lock() when dereferencing a task's cpuset pointer. See "The 216 * A task must hold both mutexes to modify cpusets. If a task holds
217 * task_lock() exception", at the end of this comment. 217 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
218 * 218 * is the only task able to also acquire callback_mutex and be able to
219 * A task must hold both mutexes to modify cpusets. If a task 219 * modify cpusets. It can perform various checks on the cpuset structure
220 * holds cgroup_mutex, then it blocks others wanting that mutex, 220 * first, knowing nothing will change. It can also allocate memory while
221 * ensuring that it is the only task able to also acquire callback_mutex 221 * just holding cpuset_mutex. While it is performing these checks, various
222 * and be able to modify cpusets. It can perform various checks on 222 * callback routines can briefly acquire callback_mutex to query cpusets.
223 * the cpuset structure first, knowing nothing will change. It can 223 * Once it is ready to make the changes, it takes callback_mutex, blocking
224 * also allocate memory while just holding cgroup_mutex. While it is 224 * everyone else.
225 * performing these checks, various callback routines can briefly
226 * acquire callback_mutex to query cpusets. Once it is ready to make
227 * the changes, it takes callback_mutex, blocking everyone else.
228 * 225 *
229 * Calls to the kernel memory allocator can not be made while holding 226 * Calls to the kernel memory allocator can not be made while holding
230 * callback_mutex, as that would risk double tripping on callback_mutex 227 * callback_mutex, as that would risk double tripping on callback_mutex
@@ -246,6 +243,7 @@ static struct cpuset top_cpuset = {
246 * guidelines for accessing subsystem state in kernel/cgroup.c 243 * guidelines for accessing subsystem state in kernel/cgroup.c
247 */ 244 */
248 245
246static DEFINE_MUTEX(cpuset_mutex);
249static DEFINE_MUTEX(callback_mutex); 247static DEFINE_MUTEX(callback_mutex);
250 248
251/* 249/*
@@ -351,7 +349,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
351/* 349/*
352 * update task's spread flag if cpuset's page/slab spread flag is set 350 * update task's spread flag if cpuset's page/slab spread flag is set
353 * 351 *
354 * Called with callback_mutex/cgroup_mutex held 352 * Called with callback_mutex/cpuset_mutex held
355 */ 353 */
356static void cpuset_update_task_spread_flag(struct cpuset *cs, 354static void cpuset_update_task_spread_flag(struct cpuset *cs,
357 struct task_struct *tsk) 355 struct task_struct *tsk)
@@ -371,7 +369,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
371 * 369 *
372 * One cpuset is a subset of another if all its allowed CPUs and 370 * One cpuset is a subset of another if all its allowed CPUs and
373 * Memory Nodes are a subset of the other, and its exclusive flags 371 * Memory Nodes are a subset of the other, and its exclusive flags
374 * are only set if the other's are set. Call holding cgroup_mutex. 372 * are only set if the other's are set. Call holding cpuset_mutex.
375 */ 373 */
376 374
377static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 375static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -420,7 +418,7 @@ static void free_trial_cpuset(struct cpuset *trial)
420 * If we replaced the flag and mask values of the current cpuset 418 * If we replaced the flag and mask values of the current cpuset
421 * (cur) with those values in the trial cpuset (trial), would 419 * (cur) with those values in the trial cpuset (trial), would
422 * our various subset and exclusive rules still be valid? Presumes 420 * our various subset and exclusive rules still be valid? Presumes
423 * cgroup_mutex held. 421 * cpuset_mutex held.
424 * 422 *
425 * 'cur' is the address of an actual, in-use cpuset. Operations 423 * 'cur' is the address of an actual, in-use cpuset. Operations
426 * such as list traversal that depend on the actual address of the 424 * such as list traversal that depend on the actual address of the
@@ -555,7 +553,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
555 * domains when operating in the severe memory shortage situations 553 * domains when operating in the severe memory shortage situations
556 * that could cause allocation failures below. 554 * that could cause allocation failures below.
557 * 555 *
558 * Must be called with cgroup_lock held. 556 * Must be called with cpuset_mutex held.
559 * 557 *
560 * The three key local variables below are: 558 * The three key local variables below are:
561 * q - a linked-list queue of cpuset pointers, used to implement a 559 * q - a linked-list queue of cpuset pointers, used to implement a
@@ -766,7 +764,7 @@ done:
766 * 'cpus' is removed, then call this routine to rebuild the 764 * 'cpus' is removed, then call this routine to rebuild the
767 * scheduler's dynamic sched domains. 765 * scheduler's dynamic sched domains.
768 * 766 *
769 * Call with cgroup_mutex held. Takes get_online_cpus(). 767 * Call with cpuset_mutex held. Takes get_online_cpus().
770 */ 768 */
771static void rebuild_sched_domains_locked(void) 769static void rebuild_sched_domains_locked(void)
772{ 770{
@@ -774,7 +772,7 @@ static void rebuild_sched_domains_locked(void)
774 cpumask_var_t *doms; 772 cpumask_var_t *doms;
775 int ndoms; 773 int ndoms;
776 774
777 WARN_ON_ONCE(!cgroup_lock_is_held()); 775 lockdep_assert_held(&cpuset_mutex);
778 get_online_cpus(); 776 get_online_cpus();
779 777
780 /* Generate domain masks and attrs */ 778 /* Generate domain masks and attrs */
@@ -800,9 +798,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
800 798
801void rebuild_sched_domains(void) 799void rebuild_sched_domains(void)
802{ 800{
803 cgroup_lock(); 801 mutex_lock(&cpuset_mutex);
804 rebuild_sched_domains_locked(); 802 rebuild_sched_domains_locked();
805 cgroup_unlock(); 803 mutex_unlock(&cpuset_mutex);
806} 804}
807 805
808/** 806/**
@@ -810,7 +808,7 @@ void rebuild_sched_domains(void)
810 * @tsk: task to test 808 * @tsk: task to test
811 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner 809 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
812 * 810 *
813 * Call with cgroup_mutex held. May take callback_mutex during call. 811 * Call with cpuset_mutex held. May take callback_mutex during call.
814 * Called for each task in a cgroup by cgroup_scan_tasks(). 812 * Called for each task in a cgroup by cgroup_scan_tasks().
815 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other 813 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
816 * words, if its mask is not equal to its cpuset's mask). 814 * words, if its mask is not equal to its cpuset's mask).
@@ -831,7 +829,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
831 * cpus_allowed mask needs to be changed. 829 * cpus_allowed mask needs to be changed.
832 * 830 *
833 * We don't need to re-check for the cgroup/cpuset membership, since we're 831 * We don't need to re-check for the cgroup/cpuset membership, since we're
834 * holding cgroup_lock() at this point. 832 * holding cpuset_mutex at this point.
835 */ 833 */
836static void cpuset_change_cpumask(struct task_struct *tsk, 834static void cpuset_change_cpumask(struct task_struct *tsk,
837 struct cgroup_scanner *scan) 835 struct cgroup_scanner *scan)
@@ -844,7 +842,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
844 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed 842 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
845 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 843 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
846 * 844 *
847 * Called with cgroup_mutex held 845 * Called with cpuset_mutex held
848 * 846 *
849 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 847 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
850 * calling callback functions for each. 848 * calling callback functions for each.
@@ -934,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
934 * Temporarilly set tasks mems_allowed to target nodes of migration, 932 * Temporarilly set tasks mems_allowed to target nodes of migration,
935 * so that the migration code can allocate pages on these nodes. 933 * so that the migration code can allocate pages on these nodes.
936 * 934 *
937 * Call holding cgroup_mutex, so current's cpuset won't change 935 * Call holding cpuset_mutex, so current's cpuset won't change
938 * during this call, as manage_mutex holds off any cpuset_attach() 936 * during this call, as manage_mutex holds off any cpuset_attach()
939 * calls. Therefore we don't need to take task_lock around the 937 * calls. Therefore we don't need to take task_lock around the
940 * call to guarantee_online_mems(), as we know no one is changing 938 * call to guarantee_online_mems(), as we know no one is changing
@@ -1009,7 +1007,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
1009/* 1007/*
1010 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy 1008 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
1011 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if 1009 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
1012 * memory_migrate flag is set. Called with cgroup_mutex held. 1010 * memory_migrate flag is set. Called with cpuset_mutex held.
1013 */ 1011 */
1014static void cpuset_change_nodemask(struct task_struct *p, 1012static void cpuset_change_nodemask(struct task_struct *p,
1015 struct cgroup_scanner *scan) 1013 struct cgroup_scanner *scan)
@@ -1018,7 +1016,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
1018 struct cpuset *cs; 1016 struct cpuset *cs;
1019 int migrate; 1017 int migrate;
1020 const nodemask_t *oldmem = scan->data; 1018 const nodemask_t *oldmem = scan->data;
1021 static nodemask_t newmems; /* protected by cgroup_mutex */ 1019 static nodemask_t newmems; /* protected by cpuset_mutex */
1022 1020
1023 cs = cgroup_cs(scan->cg); 1021 cs = cgroup_cs(scan->cg);
1024 guarantee_online_mems(cs, &newmems); 1022 guarantee_online_mems(cs, &newmems);
@@ -1045,7 +1043,7 @@ static void *cpuset_being_rebound;
1045 * @oldmem: old mems_allowed of cpuset cs 1043 * @oldmem: old mems_allowed of cpuset cs
1046 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1044 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1047 * 1045 *
1048 * Called with cgroup_mutex held 1046 * Called with cpuset_mutex held
1049 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 1047 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
1050 * if @heap != NULL. 1048 * if @heap != NULL.
1051 */ 1049 */
@@ -1067,7 +1065,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1067 * take while holding tasklist_lock. Forks can happen - the 1065 * take while holding tasklist_lock. Forks can happen - the
1068 * mpol_dup() cpuset_being_rebound check will catch such forks, 1066 * mpol_dup() cpuset_being_rebound check will catch such forks,
1069 * and rebind their vma mempolicies too. Because we still hold 1067 * and rebind their vma mempolicies too. Because we still hold
1070 * the global cgroup_mutex, we know that no other rebind effort 1068 * the global cpuset_mutex, we know that no other rebind effort
1071 * will be contending for the global variable cpuset_being_rebound. 1069 * will be contending for the global variable cpuset_being_rebound.
1072 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1070 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1073 * is idempotent. Also migrate pages in each mm to new nodes. 1071 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1086,7 +1084,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1086 * mempolicies and if the cpuset is marked 'memory_migrate', 1084 * mempolicies and if the cpuset is marked 'memory_migrate',
1087 * migrate the tasks pages to the new memory. 1085 * migrate the tasks pages to the new memory.
1088 * 1086 *
1089 * Call with cgroup_mutex held. May take callback_mutex during call. 1087 * Call with cpuset_mutex held. May take callback_mutex during call.
1090 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 1088 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1091 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 1089 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1092 * their mempolicies to the cpusets new mems_allowed. 1090 * their mempolicies to the cpusets new mems_allowed.
@@ -1184,7 +1182,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1184 * Called by cgroup_scan_tasks() for each task in a cgroup. 1182 * Called by cgroup_scan_tasks() for each task in a cgroup.
1185 * 1183 *
1186 * We don't need to re-check for the cgroup/cpuset membership, since we're 1184 * We don't need to re-check for the cgroup/cpuset membership, since we're
1187 * holding cgroup_lock() at this point. 1185 * holding cpuset_mutex at this point.
1188 */ 1186 */
1189static void cpuset_change_flag(struct task_struct *tsk, 1187static void cpuset_change_flag(struct task_struct *tsk,
1190 struct cgroup_scanner *scan) 1188 struct cgroup_scanner *scan)
@@ -1197,7 +1195,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
1197 * @cs: the cpuset in which each task's spread flags needs to be changed 1195 * @cs: the cpuset in which each task's spread flags needs to be changed
1198 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() 1196 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1199 * 1197 *
1200 * Called with cgroup_mutex held 1198 * Called with cpuset_mutex held
1201 * 1199 *
1202 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 1200 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1203 * calling callback functions for each. 1201 * calling callback functions for each.
@@ -1222,7 +1220,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
1222 * cs: the cpuset to update 1220 * cs: the cpuset to update
1223 * turning_on: whether the flag is being set or cleared 1221 * turning_on: whether the flag is being set or cleared
1224 * 1222 *
1225 * Call with cgroup_mutex held. 1223 * Call with cpuset_mutex held.
1226 */ 1224 */
1227 1225
1228static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1226static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1370,15 +1368,18 @@ static int fmeter_getrate(struct fmeter *fmp)
1370 return val; 1368 return val;
1371} 1369}
1372 1370
1373/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1371/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
1374static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1372static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1375{ 1373{
1376 struct cpuset *cs = cgroup_cs(cgrp); 1374 struct cpuset *cs = cgroup_cs(cgrp);
1377 struct task_struct *task; 1375 struct task_struct *task;
1378 int ret; 1376 int ret;
1379 1377
1378 mutex_lock(&cpuset_mutex);
1379
1380 ret = -ENOSPC;
1380 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1381 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1381 return -ENOSPC; 1382 goto out_unlock;
1382 1383
1383 cgroup_taskset_for_each(task, cgrp, tset) { 1384 cgroup_taskset_for_each(task, cgrp, tset) {
1384 /* 1385 /*
@@ -1390,10 +1391,12 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1390 * set_cpus_allowed_ptr() on all attached tasks before 1391 * set_cpus_allowed_ptr() on all attached tasks before
1391 * cpus_allowed may be changed. 1392 * cpus_allowed may be changed.
1392 */ 1393 */
1394 ret = -EINVAL;
1393 if (task->flags & PF_THREAD_BOUND) 1395 if (task->flags & PF_THREAD_BOUND)
1394 return -EINVAL; 1396 goto out_unlock;
1395 if ((ret = security_task_setscheduler(task))) 1397 ret = security_task_setscheduler(task);
1396 return ret; 1398 if (ret)
1399 goto out_unlock;
1397 } 1400 }
1398 1401
1399 /* 1402 /*
@@ -1401,18 +1404,22 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1401 * changes which zero cpus/mems_allowed. 1404 * changes which zero cpus/mems_allowed.
1402 */ 1405 */
1403 cs->attach_in_progress++; 1406 cs->attach_in_progress++;
1404 1407 ret = 0;
1405 return 0; 1408out_unlock:
1409 mutex_unlock(&cpuset_mutex);
1410 return ret;
1406} 1411}
1407 1412
1408static void cpuset_cancel_attach(struct cgroup *cgrp, 1413static void cpuset_cancel_attach(struct cgroup *cgrp,
1409 struct cgroup_taskset *tset) 1414 struct cgroup_taskset *tset)
1410{ 1415{
1416 mutex_lock(&cpuset_mutex);
1411 cgroup_cs(cgrp)->attach_in_progress--; 1417 cgroup_cs(cgrp)->attach_in_progress--;
1418 mutex_unlock(&cpuset_mutex);
1412} 1419}
1413 1420
1414/* 1421/*
1415 * Protected by cgroup_mutex. cpus_attach is used only by cpuset_attach() 1422 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
1416 * but we can't allocate it dynamically there. Define it global and 1423 * but we can't allocate it dynamically there. Define it global and
1417 * allocate from cpuset_init(). 1424 * allocate from cpuset_init().
1418 */ 1425 */
@@ -1420,7 +1427,7 @@ static cpumask_var_t cpus_attach;
1420 1427
1421static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) 1428static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1422{ 1429{
1423 /* static bufs protected by cgroup_mutex */ 1430 /* static bufs protected by cpuset_mutex */
1424 static nodemask_t cpuset_attach_nodemask_from; 1431 static nodemask_t cpuset_attach_nodemask_from;
1425 static nodemask_t cpuset_attach_nodemask_to; 1432 static nodemask_t cpuset_attach_nodemask_to;
1426 struct mm_struct *mm; 1433 struct mm_struct *mm;
@@ -1430,6 +1437,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1430 struct cpuset *cs = cgroup_cs(cgrp); 1437 struct cpuset *cs = cgroup_cs(cgrp);
1431 struct cpuset *oldcs = cgroup_cs(oldcgrp); 1438 struct cpuset *oldcs = cgroup_cs(oldcgrp);
1432 1439
1440 mutex_lock(&cpuset_mutex);
1441
1433 /* prepare for attach */ 1442 /* prepare for attach */
1434 if (cs == &top_cpuset) 1443 if (cs == &top_cpuset)
1435 cpumask_copy(cpus_attach, cpu_possible_mask); 1444 cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1473,6 +1482,8 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1473 */ 1482 */
1474 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1483 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1475 schedule_cpuset_propagate_hotplug(cs); 1484 schedule_cpuset_propagate_hotplug(cs);
1485
1486 mutex_unlock(&cpuset_mutex);
1476} 1487}
1477 1488
1478/* The various types of files and directories in a cpuset file system */ 1489/* The various types of files and directories in a cpuset file system */
@@ -1494,12 +1505,13 @@ typedef enum {
1494 1505
1495static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1506static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1496{ 1507{
1497 int retval = 0;
1498 struct cpuset *cs = cgroup_cs(cgrp); 1508 struct cpuset *cs = cgroup_cs(cgrp);
1499 cpuset_filetype_t type = cft->private; 1509 cpuset_filetype_t type = cft->private;
1510 int retval = -ENODEV;
1500 1511
1501 if (!cgroup_lock_live_group(cgrp)) 1512 mutex_lock(&cpuset_mutex);
1502 return -ENODEV; 1513 if (!is_cpuset_online(cs))
1514 goto out_unlock;
1503 1515
1504 switch (type) { 1516 switch (type) {
1505 case FILE_CPU_EXCLUSIVE: 1517 case FILE_CPU_EXCLUSIVE:
@@ -1533,18 +1545,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1533 retval = -EINVAL; 1545 retval = -EINVAL;
1534 break; 1546 break;
1535 } 1547 }
1536 cgroup_unlock(); 1548out_unlock:
1549 mutex_unlock(&cpuset_mutex);
1537 return retval; 1550 return retval;
1538} 1551}
1539 1552
1540static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) 1553static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1541{ 1554{
1542 int retval = 0;
1543 struct cpuset *cs = cgroup_cs(cgrp); 1555 struct cpuset *cs = cgroup_cs(cgrp);
1544 cpuset_filetype_t type = cft->private; 1556 cpuset_filetype_t type = cft->private;
1557 int retval = -ENODEV;
1545 1558
1546 if (!cgroup_lock_live_group(cgrp)) 1559 mutex_lock(&cpuset_mutex);
1547 return -ENODEV; 1560 if (!is_cpuset_online(cs))
1561 goto out_unlock;
1548 1562
1549 switch (type) { 1563 switch (type) {
1550 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1564 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
@@ -1554,7 +1568,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1554 retval = -EINVAL; 1568 retval = -EINVAL;
1555 break; 1569 break;
1556 } 1570 }
1557 cgroup_unlock(); 1571out_unlock:
1572 mutex_unlock(&cpuset_mutex);
1558 return retval; 1573 return retval;
1559} 1574}
1560 1575
@@ -1564,9 +1579,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1564static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, 1579static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 const char *buf) 1580 const char *buf)
1566{ 1581{
1567 int retval = 0;
1568 struct cpuset *cs = cgroup_cs(cgrp); 1582 struct cpuset *cs = cgroup_cs(cgrp);
1569 struct cpuset *trialcs; 1583 struct cpuset *trialcs;
1584 int retval = -ENODEV;
1570 1585
1571 /* 1586 /*
1572 * CPU or memory hotunplug may leave @cs w/o any execution 1587 * CPU or memory hotunplug may leave @cs w/o any execution
@@ -1586,13 +1601,14 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1586 flush_work(&cpuset_hotplug_work); 1601 flush_work(&cpuset_hotplug_work);
1587 flush_workqueue(cpuset_propagate_hotplug_wq); 1602 flush_workqueue(cpuset_propagate_hotplug_wq);
1588 1603
1589 if (!cgroup_lock_live_group(cgrp)) 1604 mutex_lock(&cpuset_mutex);
1590 return -ENODEV; 1605 if (!is_cpuset_online(cs))
1606 goto out_unlock;
1591 1607
1592 trialcs = alloc_trial_cpuset(cs); 1608 trialcs = alloc_trial_cpuset(cs);
1593 if (!trialcs) { 1609 if (!trialcs) {
1594 retval = -ENOMEM; 1610 retval = -ENOMEM;
1595 goto out; 1611 goto out_unlock;
1596 } 1612 }
1597 1613
1598 switch (cft->private) { 1614 switch (cft->private) {
@@ -1608,8 +1624,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1608 } 1624 }
1609 1625
1610 free_trial_cpuset(trialcs); 1626 free_trial_cpuset(trialcs);
1611out: 1627out_unlock:
1612 cgroup_unlock(); 1628 mutex_unlock(&cpuset_mutex);
1613 return retval; 1629 return retval;
1614} 1630}
1615 1631
@@ -1867,6 +1883,8 @@ static int cpuset_css_online(struct cgroup *cgrp)
1867 if (!parent) 1883 if (!parent)
1868 return 0; 1884 return 0;
1869 1885
1886 mutex_lock(&cpuset_mutex);
1887
1870 set_bit(CS_ONLINE, &cs->flags); 1888 set_bit(CS_ONLINE, &cs->flags);
1871 if (is_spread_page(parent)) 1889 if (is_spread_page(parent))
1872 set_bit(CS_SPREAD_PAGE, &cs->flags); 1890 set_bit(CS_SPREAD_PAGE, &cs->flags);
@@ -1876,7 +1894,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1876 number_of_cpusets++; 1894 number_of_cpusets++;
1877 1895
1878 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) 1896 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
1879 return 0; 1897 goto out_unlock;
1880 1898
1881 /* 1899 /*
1882 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is 1900 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
@@ -1895,7 +1913,7 @@ static int cpuset_css_online(struct cgroup *cgrp)
1895 cpuset_for_each_child(tmp_cs, pos_cg, parent) { 1913 cpuset_for_each_child(tmp_cs, pos_cg, parent) {
1896 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { 1914 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
1897 rcu_read_unlock(); 1915 rcu_read_unlock();
1898 return 0; 1916 goto out_unlock;
1899 } 1917 }
1900 } 1918 }
1901 rcu_read_unlock(); 1919 rcu_read_unlock();
@@ -1904,7 +1922,8 @@ static int cpuset_css_online(struct cgroup *cgrp)
1904 cs->mems_allowed = parent->mems_allowed; 1922 cs->mems_allowed = parent->mems_allowed;
1905 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); 1923 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
1906 mutex_unlock(&callback_mutex); 1924 mutex_unlock(&callback_mutex);
1907 1925out_unlock:
1926 mutex_unlock(&cpuset_mutex);
1908 return 0; 1927 return 0;
1909} 1928}
1910 1929
@@ -1912,8 +1931,7 @@ static void cpuset_css_offline(struct cgroup *cgrp)
1912{ 1931{
1913 struct cpuset *cs = cgroup_cs(cgrp); 1932 struct cpuset *cs = cgroup_cs(cgrp);
1914 1933
1915 /* css_offline is called w/o cgroup_mutex, grab it */ 1934 mutex_lock(&cpuset_mutex);
1916 cgroup_lock();
1917 1935
1918 if (is_sched_load_balance(cs)) 1936 if (is_sched_load_balance(cs))
1919 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1937 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -1921,7 +1939,7 @@ static void cpuset_css_offline(struct cgroup *cgrp)
1921 number_of_cpusets--; 1939 number_of_cpusets--;
1922 clear_bit(CS_ONLINE, &cs->flags); 1940 clear_bit(CS_ONLINE, &cs->flags);
1923 1941
1924 cgroup_unlock(); 1942 mutex_unlock(&cpuset_mutex);
1925} 1943}
1926 1944
1927/* 1945/*
@@ -1996,7 +2014,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1996{ 2014{
1997 struct cgroup *new_cgroup = scan->data; 2015 struct cgroup *new_cgroup = scan->data;
1998 2016
2017 cgroup_lock();
1999 cgroup_attach_task(new_cgroup, tsk); 2018 cgroup_attach_task(new_cgroup, tsk);
2019 cgroup_unlock();
2000} 2020}
2001 2021
2002/** 2022/**
@@ -2004,7 +2024,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
2004 * @from: cpuset in which the tasks currently reside 2024 * @from: cpuset in which the tasks currently reside
2005 * @to: cpuset to which the tasks will be moved 2025 * @to: cpuset to which the tasks will be moved
2006 * 2026 *
2007 * Called with cgroup_mutex held 2027 * Called with cpuset_mutex held
2008 * callback_mutex must not be held, as cpuset_attach() will take it. 2028 * callback_mutex must not be held, as cpuset_attach() will take it.
2009 * 2029 *
2010 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, 2030 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
@@ -2031,9 +2051,6 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
2031 * removing that CPU or node from all cpusets. If this removes the 2051 * removing that CPU or node from all cpusets. If this removes the
2032 * last CPU or node from a cpuset, then move the tasks in the empty 2052 * last CPU or node from a cpuset, then move the tasks in the empty
2033 * cpuset to its next-highest non-empty parent. 2053 * cpuset to its next-highest non-empty parent.
2034 *
2035 * Called with cgroup_mutex held
2036 * callback_mutex must not be held, as cpuset_attach() will take it.
2037 */ 2054 */
2038static void remove_tasks_in_empty_cpuset(struct cpuset *cs) 2055static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2039{ 2056{
@@ -2089,8 +2106,9 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2089 static cpumask_t off_cpus; 2106 static cpumask_t off_cpus;
2090 static nodemask_t off_mems, tmp_mems; 2107 static nodemask_t off_mems, tmp_mems;
2091 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work); 2108 struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
2109 bool is_empty;
2092 2110
2093 cgroup_lock(); 2111 mutex_lock(&cpuset_mutex);
2094 2112
2095 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed); 2113 cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
2096 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed); 2114 nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
@@ -2112,10 +2130,18 @@ static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
2112 update_tasks_nodemask(cs, &tmp_mems, NULL); 2130 update_tasks_nodemask(cs, &tmp_mems, NULL);
2113 } 2131 }
2114 2132
2115 if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 2133 is_empty = cpumask_empty(cs->cpus_allowed) ||
2116 remove_tasks_in_empty_cpuset(cs); 2134 nodes_empty(cs->mems_allowed);
2117 2135
2118 cgroup_unlock(); 2136 mutex_unlock(&cpuset_mutex);
2137
2138 /*
2139 * If @cs became empty, move tasks to the nearest ancestor with
2140 * execution resources. This is full cgroup operation which will
2141 * also call back into cpuset. Should be done outside any lock.
2142 */
2143 if (is_empty)
2144 remove_tasks_in_empty_cpuset(cs);
2119 2145
2120 /* the following may free @cs, should be the last operation */ 2146 /* the following may free @cs, should be the last operation */
2121 css_put(&cs->css); 2147 css_put(&cs->css);
@@ -2169,7 +2195,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2169 bool cpus_updated, mems_updated; 2195 bool cpus_updated, mems_updated;
2170 bool cpus_offlined, mems_offlined; 2196 bool cpus_offlined, mems_offlined;
2171 2197
2172 cgroup_lock(); 2198 mutex_lock(&cpuset_mutex);
2173 2199
2174 /* fetch the available cpus/mems and find out which changed how */ 2200 /* fetch the available cpus/mems and find out which changed how */
2175 cpumask_copy(&new_cpus, cpu_active_mask); 2201 cpumask_copy(&new_cpus, cpu_active_mask);
@@ -2211,7 +2237,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2211 schedule_cpuset_propagate_hotplug(cs); 2237 schedule_cpuset_propagate_hotplug(cs);
2212 } 2238 }
2213 2239
2214 cgroup_unlock(); 2240 mutex_unlock(&cpuset_mutex);
2215 2241
2216 /* wait for propagations to finish */ 2242 /* wait for propagations to finish */
2217 flush_workqueue(cpuset_propagate_hotplug_wq); 2243 flush_workqueue(cpuset_propagate_hotplug_wq);
@@ -2222,9 +2248,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
2222 cpumask_var_t *doms; 2248 cpumask_var_t *doms;
2223 int ndoms; 2249 int ndoms;
2224 2250
2225 cgroup_lock(); 2251 mutex_lock(&cpuset_mutex);
2226 ndoms = generate_sched_domains(&doms, &attr); 2252 ndoms = generate_sched_domains(&doms, &attr);
2227 cgroup_unlock(); 2253 mutex_unlock(&cpuset_mutex);
2228 2254
2229 partition_sched_domains(ndoms, doms, attr); 2255 partition_sched_domains(ndoms, doms, attr);
2230 } 2256 }
@@ -2650,7 +2676,7 @@ void __cpuset_memory_pressure_bump(void)
2650 * - Used for /proc/<pid>/cpuset. 2676 * - Used for /proc/<pid>/cpuset.
2651 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2677 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2652 * doesn't really matter if tsk->cpuset changes after we read it, 2678 * doesn't really matter if tsk->cpuset changes after we read it,
2653 * and we take cgroup_mutex, keeping cpuset_attach() from changing it 2679 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2654 * anyway. 2680 * anyway.
2655 */ 2681 */
2656static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2682static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2673,7 +2699,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2673 goto out_free; 2699 goto out_free;
2674 2700
2675 retval = -EINVAL; 2701 retval = -EINVAL;
2676 cgroup_lock(); 2702 mutex_lock(&cpuset_mutex);
2677 css = task_subsys_state(tsk, cpuset_subsys_id); 2703 css = task_subsys_state(tsk, cpuset_subsys_id);
2678 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); 2704 retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
2679 if (retval < 0) 2705 if (retval < 0)
@@ -2681,7 +2707,7 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2681 seq_puts(m, buf); 2707 seq_puts(m, buf);
2682 seq_putc(m, '\n'); 2708 seq_putc(m, '\n');
2683out_unlock: 2709out_unlock:
2684 cgroup_unlock(); 2710 mutex_unlock(&cpuset_mutex);
2685 put_task_struct(tsk); 2711 put_task_struct(tsk);
2686out_free: 2712out_free:
2687 kfree(buf); 2713 kfree(buf);