Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	195
1 file changed, 82 insertions(+), 113 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
  * users. If someone tries to mount the "cpuset" filesystem, we
  * silently switch it to mount "cgroup" instead
  */
-static int cpuset_get_sb(struct file_system_type *fs_type,
-			 int flags, const char *unused_dev_name,
-			 void *data, struct vfsmount *mnt)
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+			 int flags, const char *unused_dev_name, void *data)
 {
 	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
-	int ret = -ENODEV;
+	struct dentry *ret = ERR_PTR(-ENODEV);
 	if (cgroup_fs) {
 		char mountopts[] =
 			"cpuset,noprefix,"
 			"release_agent=/sbin/cpuset_release_agent";
-		ret = cgroup_fs->get_sb(cgroup_fs, flags,
-			   unused_dev_name, mountopts, mnt);
+		ret = cgroup_fs->mount(cgroup_fs, flags,
+			   unused_dev_name, mountopts);
 		put_filesystem(cgroup_fs);
 	}
 	return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
 
 static struct file_system_type cpuset_fs_type = {
 	.name = "cpuset",
-	.get_sb = cpuset_get_sb,
+	.mount = cpuset_mount,
 };
 
 /*
@@ -1016,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
-	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-	if (!newmems)
-		return;
+	static nodemask_t newmems;	/* protected by cgroup_mutex */
 
 	cs = cgroup_cs(scan->cg);
-	guarantee_online_mems(cs, newmems);
+	guarantee_online_mems(cs, &newmems);
 
-	cpuset_change_task_nodemask(p, newmems);
-
-	NODEMASK_FREE(newmems);
+	cpuset_change_task_nodemask(p, &newmems);
 
 	mm = get_task_mm(p);
 	if (!mm)
@@ -1165,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
-	if (val < -1 || val >= SD_LV_MAX)
+	if (val < -1 || val >= sched_domain_level_max)
 		return -EINVAL;
 #endif
 
@@ -1373,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 	return val;
 }
 
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			     struct task_struct *tsk, bool threadgroup)
+			     struct task_struct *tsk)
 {
-	int ret;
 	struct cpuset *cs = cgroup_cs(cont);
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1397,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	if (tsk->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
-	ret = security_task_setscheduler(tsk, 0, NULL);
-	if (ret)
-		return ret;
-	if (threadgroup) {
-		struct task_struct *c;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			ret = security_task_setscheduler(c, 0, NULL);
-			if (ret) {
-				rcu_read_unlock();
-				return ret;
-			}
-		}
-		rcu_read_unlock();
-	}
 	return 0;
 }
 
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
-			       struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+	return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+	struct cpuset *cs = cgroup_cs(cont);
+
+	if (cs == &top_cpuset)
+		cpumask_copy(cpus_attach, cpu_possible_mask);
+	else
+		guarantee_online_cpus(cs, cpus_attach);
+
+	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
 {
 	int err;
+	struct cpuset *cs = cgroup_cs(cont);
+
 	/*
 	 * can_attach beforehand should guarantee that this doesn't fail.
 	 * TODO: have a better way to handle failure here
@@ -1427,56 +1430,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	cpuset_change_task_nodemask(tsk, to);
+	cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
 	cpuset_update_task_spread_flag(cs, tsk);
-
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			  struct cgroup *oldcont, struct task_struct *tsk,
-			  bool threadgroup)
+			  struct cgroup *oldcont, struct task_struct *tsk)
 {
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
-	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-	if (from == NULL || to == NULL)
-		goto alloc_fail;
 
-	if (cs == &top_cpuset) {
-		cpumask_copy(cpus_attach, cpu_possible_mask);
-	} else {
-		guarantee_online_cpus(cs, cpus_attach);
-	}
-	guarantee_online_mems(cs, to);
-
-	/* do per-task migration stuff possibly for each in the threadgroup */
-	cpuset_attach_task(tsk, to, cs);
-	if (threadgroup) {
-		struct task_struct *c;
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			cpuset_attach_task(c, to, cs);
-		}
-		rcu_read_unlock();
-	}
-
-	/* change mm; only needs to be done once even if threadgroup */
-	*from = oldcs->mems_allowed;
-	*to = cs->mems_allowed;
+	/*
+	 * Change mm, possibly for multiple threads in a threadgroup. This is
+	 * expensive and may sleep.
+	 */
+	cpuset_attach_nodemask_from = oldcs->mems_allowed;
+	cpuset_attach_nodemask_to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, to);
+		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, from, to);
+			cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+					  &cpuset_attach_nodemask_to);
 		mmput(mm);
 	}
-
-alloc_fail:
-	NODEMASK_FREE(from);
-	NODEMASK_FREE(to);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -1576,8 +1554,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 		return -ENODEV;
 
 	trialcs = alloc_trial_cpuset(cs);
-	if (!trialcs)
-		return -ENOMEM;
+	if (!trialcs) {
+		retval = -ENOMEM;
+		goto out;
+	}
 
 	switch (cft->private) {
 	case FILE_CPULIST:
@@ -1592,6 +1572,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	free_trial_cpuset(trialcs);
+out:
 	cgroup_unlock();
 	return retval;
 }
@@ -1608,34 +1589,26 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
  * across a page fault.
  */
 
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
-	int ret;
+	size_t count;
 
 	mutex_lock(&callback_mutex);
-	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+	count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
-	return ret;
+	return count;
 }
 
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
-	int retval;
-
-	if (mask == NULL)
-		return -ENOMEM;
+	size_t count;
 
 	mutex_lock(&callback_mutex);
-	*mask = cs->mems_allowed;
+	count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
 	mutex_unlock(&callback_mutex);
 
-	retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
-	NODEMASK_FREE(mask);
-
-	return retval;
+	return count;
 }
 
 static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1829,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 }
 
 /*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
  *
  * Currently we refuse to set up the cgroup - thereby
  * refusing the task to be entered, and as a result refusing
@@ -1860,8 +1832,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
 	cs = cgroup_cs(cgroup);
 	parent_cs = cgroup_cs(parent);
 
+	mutex_lock(&callback_mutex);
 	cs->mems_allowed = parent_cs->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+	mutex_unlock(&callback_mutex);
 	return;
 }
 
@@ -1929,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.create = cpuset_create,
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
+	.can_attach_task = cpuset_can_attach_task,
+	.pre_attach = cpuset_pre_attach,
+	.attach_task = cpuset_attach_task,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
@@ -2064,10 +2041,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
 	struct cgroup *cont;
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2084,7 +2058,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
-		*oldmems = cp->mems_allowed;
+		oldmems = cp->mems_allowed;
 
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
@@ -2100,10 +2074,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 			remove_tasks_in_empty_cpuset(cp);
 		else {
 			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, oldmems, NULL);
+			update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
-	NODEMASK_FREE(oldmems);
 }
 
 /*
@@ -2145,19 +2118,16 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return NOTIFY_DONE;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	cgroup_lock();
 	switch (action) {
 	case MEM_ONLINE:
-		*oldmems = top_cpuset.mems_allowed;
+		oldmems = top_cpuset.mems_allowed;
 		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 		mutex_unlock(&callback_mutex);
-		update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
 		break;
 	case MEM_OFFLINE:
 		/*
@@ -2171,7 +2141,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	}
 	cgroup_unlock();
 
-	NODEMASK_FREE(oldmems);
 	return NOTIFY_OK;
 }
 #endif
@@ -2221,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs = task_cs(tsk);
 	if (cs)
-		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+		do_set_cpus_allowed(tsk, cs->cpus_allowed);
 	rcu_read_unlock();
 
 	/*
@@ -2248,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	 * Like above we can temporary set any mask and rely on
 	 * set_cpus_allowed_ptr() as synchronization point.
 	 */
-	cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+	do_set_cpus_allowed(tsk, cpu_possible_mask);
 	cpu = cpumask_any(cpu_active_mask);
 	}
 