Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	179
1 file changed, 160 insertions(+), 19 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38d..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -18,7 +18,6 @@
  * distribution for more details.
  */
 
-#include <linux/config.h>
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
@@ -241,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty.  Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks.  So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
@@ -290,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
 		inode->i_mode = mode;
 		inode->i_uid = current->fsuid;
 		inode->i_gid = current->fsgid;
-		inode->i_blksize = PAGE_CACHE_SIZE;
 		inode->i_blocks = 0;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -763,6 +761,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *
  * Call with manage_mutex held.  May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
  */
 
 static void update_cpu_domains(struct cpuset *cur)
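
The two comment lines added above encode a kernel-wide lock ordering rule. A minimal sketch of why the ordering matters, with hypothetical thread functions (only the lock names are real; none of this is in the patch):

	/* Illustration only: the ordering the comment above mandates. */
	static void sanctioned_order(void)	/* hypothetical */
	{
		lock_cpu_hotplug();		/* A: always taken first */
		mutex_lock(&callback_mutex);	/* B: nested inside A */
		/* ... */
		mutex_unlock(&callback_mutex);
		unlock_cpu_hotplug();
	}

	static void reverse_order_bug(void)	/* hypothetical */
	{
		mutex_lock(&callback_mutex);	/* B first ... */
		lock_cpu_hotplug();		/* ... then A: if another task
						 * holds A and is waiting for B,
						 * both block forever (ABBA). */
		unlock_cpu_hotplug();
		mutex_unlock(&callback_mutex);
	}

Since update_cpu_domains() takes A internally, reaching it with B held would create exactly the reverse nesting shown in the second function.
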
@@ -782,7 +782,7 @@ static void update_cpu_domains(struct cpuset *cur)
 		if (is_cpu_exclusive(c))
 			cpus_andnot(pspan, pspan, c->cpus_allowed);
 	}
-	if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+	if (!is_cpu_exclusive(cur)) {
 		cpus_or(pspan, pspan, cur->cpus_allowed);
 		if (cpus_equal(pspan, cur->cpus_allowed))
 			return;
@@ -815,6 +815,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	struct cpuset trialcs;
 	int retval, cpus_unchanged;
 
+	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
 	if (retval < 0)
@@ -908,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
@@ -1064,7 +1072,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 }
 
 /*
- * Frequency meter - How fast is some event occuring?
+ * Frequency meter - How fast is some event occurring?
  *
  * These routines manage a digitally filtered, constant time based,
  * event frequency meter.  There are four routines:
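
For readers without the surrounding file: the frequency meter documented here is an exponentially decaying rate estimate. A userspace sketch of the same digital-filter idea, with illustrative names and constants (the kernel's own fmeter differs in naming, locking, and overflow caps):

	#include <time.h>

	#define SCALE 1000	/* fixed point: val == 1000 means 1 event/sec */
	#define COEF   933	/* per-second decay factor, ~10 s half-life */

	struct fmeter_sketch {
		int cnt;	/* events seen but not yet filtered in */
		int val;	/* filtered events per second, times SCALE */
		time_t time;	/* when val was last decayed */
	};

	/* Decay val once per elapsed second, then fold in pending events.
	 * (A real implementation caps the tick loop and cnt.) */
	static void fm_update(struct fmeter_sketch *f, time_t now)
	{
		time_t ticks = now - f->time;

		while (ticks-- > 0)
			f->val = (COEF * f->val) / SCALE;
		f->time = now;

		f->val += (SCALE - COEF) * f->cnt;	/* each event adds 67 */
		f->cnt = 0;
	}

	static void fm_markevent(struct fmeter_sketch *f, time_t now)
	{
		fm_update(f, now);
		f->cnt++;
	}

At a steady rate of r events/sec, val converges to 1000*r: each second it loses 6.7% of its value and gains 67 per event, so the filter reports the recent rate in milli-events per second.
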
@@ -1217,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
-	if (!oldcs) {
+	/*
+	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
+	 */
+	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
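
The 'the_top_cpuset_hack' named in the new comment lives elsewhere in this file, in cpuset_exit(). Roughly, from memory and not part of this diff: an exiting task's tsk->cpuset is repointed at top_cpuset without taking a reference, so a racing attach_task() that treated top_cpuset as a normally refcounted "old" cpuset would later decrement a count that was never incremented.

	/* Rough shape of cpuset_exit() (context only, not in this diff): */
	void cpuset_exit(struct task_struct *tsk)
	{
		struct cpuset *cs;

		cs = tsk->cpuset;
		tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack: no ref taken */

		/* ... release the reference tsk held on cs, e.g.: */
		atomic_dec(&cs->count);
	}

Hence the fix: once PF_EXITING is set, attach_task() bails out rather than juggle a pointer that may already be the unreferenced top_cpuset.
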
@@ -1918,6 +1931,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex.  Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cpuset *cs = dentry->d_fsdata;
@@ -1937,11 +1961,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
+	if (is_cpu_exclusive(cs)) {
+		int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+		if (retval < 0) {
+			mutex_unlock(&manage_mutex);
+			return retval;
+		}
+	}
 	parent = cs->parent;
 	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
-	if (is_cpu_exclusive(cs))
-		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
@@ -2016,6 +2045,104 @@ out:
 	return err;
 }
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets.  If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpuset's
+ * parent's online CPUs or nodes.  Cpusets that were already empty
+ * of CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes.  It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+	struct cpuset *c;
+
+	/* Each of our child cpusets' cpus and mems must be online */
+	list_for_each_entry(c, &cur->children, sibling) {
+		guarantee_online_cpus_mems_in_subtree(c);
+		if (!cpus_empty(c->cpus_allowed))
+			guarantee_online_cpus(c, &c->cpus_allowed);
+		if (!nodes_empty(c->mems_allowed))
+			guarantee_online_mems(c, &c->mems_allowed);
+	}
+}
+
+/*
+ * The cpus_allowed cpumask and mems_allowed nodemask in the top_cpuset
+ * track cpu_online_map and node_online_map.  Force the top cpuset to
+ * track what's online after any CPU or memory node hotplug or unplug
+ * event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here.  We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period.  This is necessary in order to make cpusets transparent
+ * (of no effect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
+
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	common_cpu_mem_hotplug_unplug();
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes(void)
+{
+	common_cpu_mem_hotplug_unplug();
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
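
guarantee_online_cpus() and guarantee_online_mems(), which the subtree walk above relies on, are pre-existing helpers not shown in this diff. From memory, the CPU variant looks roughly like this: walk up the hierarchy until some ancestor has online CPUs, then hand back the online subset of that ancestor's mask:

	/* Approximate shape of the existing helper (not changed here). */
	static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
	{
		while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
			cs = cs->parent;
		if (cs)
			cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
		else
			*pmask = cpu_online_map;
		BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
	}

This fallback-to-ancestor behavior is what lets the walk leave an emptied cpuset empty while keeping its tasks runnable.
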
@@ -2026,6 +2153,8 @@ void __init cpuset_init_smp(void)
 {
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_online_map;
+
+	hotcpu_notifier(cpuset_handle_cpuhp, 0);
 }
 
 /**
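
hotcpu_notifier() is a convenience macro from <linux/cpu.h>; when CONFIG_HOTPLUG_CPU is set it registers a static notifier block, and otherwise it compiles away, which is why cpuset_handle_cpuhp() itself is only built under that config. Hand-expanded (approximately; the real macro names the block after the callback):

	/* cpuset_init_smp() with hotcpu_notifier() expanded, roughly: */
	void __init cpuset_init_smp(void)
	{
		top_cpuset.cpus_allowed = cpu_online_map;
		top_cpuset.mems_allowed = node_online_map;

		{
			static struct notifier_block cpuset_handle_cpuhp_nb = {
				.notifier_call = cpuset_handle_cpuhp,
				.priority = 0,
			};
			register_cpu_notifier(&cpuset_handle_cpuhp_nb);
		}
	}
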
@@ -2195,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	int i;
 
 	for (i = 0; zl->zones[i]; i++) {
-		int nid = zl->zones[i]->zone_pgdat->node_id;
+		int nid = zone_to_nid(zl->zones[i]);
 
 		if (node_isset(nid, current->mems_allowed))
 			return 1;
@@ -2266,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
-	if (in_interrupt())
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = z->zone_pgdat->node_id;
+	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
@@ -2370,7 +2499,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
-	int overlap = 0;		/* do cpusets overlap? */
+	int overlap = 1;		/* do cpusets overlap? */
 
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
@@ -2442,31 +2571,43 @@ void __cpuset_memory_pressure_bump(void)
  */
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
+	struct pid *pid;
 	struct task_struct *tsk;
 	char *buf;
-	int retval = 0;
+	int retval;
 
+	retval = -ENOMEM;
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
-		return -ENOMEM;
+		goto out;
+
+	retval = -ESRCH;
+	pid = m->private;
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	if (!tsk)
+		goto out_free;
 
-	tsk = m->private;
+	retval = -EINVAL;
 	mutex_lock(&manage_mutex);
+
 	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
-		goto out;
+		goto out_unlock;
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
-out:
+out_unlock:
 	mutex_unlock(&manage_mutex);
+	put_task_struct(tsk);
+out_free:
 	kfree(buf);
+out:
 	return retval;
 }
 
 static int cpuset_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *tsk = PROC_I(inode)->task;
-	return single_open(file, proc_cpuset_show, tsk);
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cpuset_show, pid);
 }
 
 struct file_operations proc_cpuset_operations = {
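
One practical consequence of caching a struct pid instead of a task_struct: keeping /proc/<pid>/cpuset open no longer pins the task_struct of an exited task. Each show now looks the task up via get_pid_task() and takes a reference only for the duration of the read; after the task exits, the lookup fails and the read returns -ESRCH. A trivial userspace check (hypothetical program, real proc path):

	#include <stdio.h>

	int main(void)
	{
		char buf[4096];
		FILE *f = fopen("/proc/self/cpuset", "r");

		if (!f)
			return 1;
		if (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);	/* e.g. "/" for the top cpuset */
		fclose(f);
		return 0;
	}
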