Diffstat (limited to 'kernel/cpuset.c')
 kernel/cpuset.c | 179
 1 file changed, 160 insertions(+), 19 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38d..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -18,7 +18,6 @@
  * distribution for more details.
  */
 
-#include <linux/config.h>
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
@@ -241,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty. Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks. So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
@@ -290,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -763,6 +761,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *
  * Call with manage_mutex held. May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
  */
 
 static void update_cpu_domains(struct cpuset *cur)
@@ -782,7 +782,7 @@ static void update_cpu_domains(struct cpuset *cur)
 		if (is_cpu_exclusive(c))
 			cpus_andnot(pspan, pspan, c->cpus_allowed);
 	}
-	if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+	if (!is_cpu_exclusive(cur)) {
 		cpus_or(pspan, pspan, cur->cpus_allowed);
 		if (cpus_equal(pspan, cur->cpus_allowed))
 			return;
@@ -815,6 +815,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	struct cpuset trialcs;
 	int retval, cpus_unchanged;
 
+	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = cpulist_parse(buf, trialcs.cpus_allowed);
 	if (retval < 0)
@@ -908,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
@@ -1064,7 +1072,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 }
 
 /*
- * Frequency meter - How fast is some event occuring?
+ * Frequency meter - How fast is some event occurring?
  *
  * These routines manage a digitally filtered, constant time based,
  * event frequency meter. There are four routines:
@@ -1217,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
-	if (!oldcs) {
+	/*
+	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
+	 */
+	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
@@ -1918,6 +1931,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex. Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cpuset *cs = dentry->d_fsdata;
@@ -1937,11 +1961,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
+	if (is_cpu_exclusive(cs)) {
+		int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+		if (retval < 0) {
+			mutex_unlock(&manage_mutex);
+			return retval;
+		}
+	}
 	parent = cs->parent;
 	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
-	if (is_cpu_exclusive(cs))
-		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
@@ -2016,6 +2045,104 @@ out:
 	return err;
 }
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes. Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes. It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+	struct cpuset *c;
+
+	/* Each of our child cpusets mems must be online */
+	list_for_each_entry(c, &cur->children, sibling) {
+		guarantee_online_cpus_mems_in_subtree(c);
+		if (!cpus_empty(c->cpus_allowed))
+			guarantee_online_cpus(c, &c->cpus_allowed);
+		if (!nodes_empty(c->mems_allowed))
+			guarantee_online_mems(c, &c->mems_allowed);
+	}
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map. Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here. We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
+{
+	mutex_lock(&manage_mutex);
+	mutex_lock(&callback_mutex);
+
+	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
+
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	common_cpu_mem_hotplug_unplug();
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes()
+{
+	common_cpu_mem_hotplug_unplug();
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
@@ -2026,6 +2153,8 @@ void __init cpuset_init_smp(void)
 {
 	top_cpuset.cpus_allowed = cpu_online_map;
 	top_cpuset.mems_allowed = node_online_map;
+
+	hotcpu_notifier(cpuset_handle_cpuhp, 0);
 }
 
 /**
@@ -2195,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	int i;
 
 	for (i = 0; zl->zones[i]; i++) {
-		int nid = zl->zones[i]->zone_pgdat->node_id;
+		int nid = zone_to_nid(zl->zones[i]);
 
 		if (node_isset(nid, current->mems_allowed))
 			return 1;
@@ -2266,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
-	if (in_interrupt())
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = z->zone_pgdat->node_id;
+	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
@@ -2370,7 +2499,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
 {
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
-	int overlap = 0;		/* do cpusets overlap? */
+	int overlap = 1;		/* do cpusets overlap? */
 
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
@@ -2442,31 +2571,43 @@ void __cpuset_memory_pressure_bump(void)
  */
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
+	struct pid *pid;
 	struct task_struct *tsk;
 	char *buf;
-	int retval = 0;
+	int retval;
 
+	retval = -ENOMEM;
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
-		return -ENOMEM;
+		goto out;
+
+	retval = -ESRCH;
+	pid = m->private;
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	if (!tsk)
+		goto out_free;
 
-	tsk = m->private;
+	retval = -EINVAL;
 	mutex_lock(&manage_mutex);
+
 	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
-		goto out;
+		goto out_unlock;
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
-out:
+out_unlock:
 	mutex_unlock(&manage_mutex);
+	put_task_struct(tsk);
+out_free:
 	kfree(buf);
+out:
 	return retval;
 }
 
 static int cpuset_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *tsk = PROC_I(inode)->task;
-	return single_open(file, proc_cpuset_show, tsk);
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cpuset_show, pid);
 }
 
 struct file_operations proc_cpuset_operations = {
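
For context, a minimal user-space sketch (not part of the patch) of the behavioral change above: with this patch, the top cpuset's 'cpus' and 'mems' files simply mirror cpu_online_map and node_online_map, so attempts to write them fail with EACCES. The /dev/cpuset mount point and the file name used here are assumptions for illustration.

/* Illustrative only -- not from the kernel patch above. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Assumes the cpuset filesystem is mounted at /dev/cpuset. */
	int fd = open("/dev/cpuset/cpus", O_WRONLY);

	if (fd < 0) {
		perror("open /dev/cpuset/cpus");
		return 1;
	}
	/* With this patch applied, the write is expected to fail with
	 * EACCES, since top_cpuset.cpus_allowed is read-only and only
	 * tracks cpu_online_map. */
	if (write(fd, "0-1", 3) < 0)
		fprintf(stderr, "write: %s\n", strerror(errno));
	close(fd);
	return 0;
}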
