Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	113
1 file changed, 96 insertions(+), 17 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ea6f0dc2fc5..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -240,7 +240,7 @@ static struct super_block *cpuset_sb;
  * A cpuset can only be deleted if both its 'count' of using tasks
  * is zero, and its list of 'children' cpusets is empty. Since all
  * tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
  * always has either children cpusets and/or using tasks. So we don't
  * need a special hack to ensure that top_cpuset cannot be deleted.
  *
@@ -289,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
 	inode->i_mode = mode;
 	inode->i_uid = current->fsuid;
 	inode->i_gid = current->fsgid;
-	inode->i_blksize = PAGE_CACHE_SIZE;
 	inode->i_blocks = 0;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -913,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 	int fudge;
 	int retval;
 
+	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
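
The guard added above makes the root cpuset's memory placement read-only from userspace: a write to the top-level 'mems' file now fails with EACCES, while child cpusets stay writable. A minimal sketch of probing that behaviour, assuming the cpuset filesystem is mounted at /dev/cpuset (the mount point is an assumption, not part of this diff):

/* Sketch only -- not part of this patch.  Assumes a cpuset mount at /dev/cpuset. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/cpuset/mems", O_WRONLY);	/* root cpuset's mems file */

	if (fd < 0) {
		perror("open /dev/cpuset/mems");
		return 1;
	}
	if (write(fd, "0", 1) < 0 && errno == EACCES)
		printf("top_cpuset mems is read-only, as expected\n");
	else
		printf("write did not fail with EACCES\n");
	close(fd);
	return 0;
}
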
@@ -1222,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
-	if (!oldcs) {
+	/*
+	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
+	 */
+	if (tsk->flags & PF_EXITING) {
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		put_task_struct(tsk);
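
For context, attach_task() is what runs when a pid is written to a cpuset's 'tasks' file; with the change above a task caught in PF_EXITING is rejected rather than mis-counted against top_cpuset. A small user-space sketch of that interface, assuming an already-created cpuset named 'example' under a /dev/cpuset mount (both names are assumptions, not from this diff):

/* Sketch only -- not part of this patch. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/dev/cpuset/example/tasks", "w");

	if (!f) {
		perror("open tasks file");
		return 1;
	}
	fprintf(f, "%d\n", (int)getpid());	/* move this task into the cpuset */
	if (fclose(f) != 0)			/* the buffered write is flushed here */
		perror("attach failed");	/* attach can fail, e.g. for an exiting task */
	return 0;
}
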
@@ -2037,33 +2045,104 @@ out:
 	return err;
 }
 
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
 /*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period. This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * This handles CPU hotplug (cpuhp) events. If someday Memory
- * Nodes can be hotplugged (dynamically changing node_online_map)
- * then we should handle that too, perhaps in a similar way.
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes. Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes. It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
  */
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int cpuset_handle_cpuhp(struct notifier_block *nb,
-				unsigned long phase, void *cpu)
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+	struct cpuset *c;
+
+	/* Each of our child cpusets mems must be online */
+	list_for_each_entry(c, &cur->children, sibling) {
+		guarantee_online_cpus_mems_in_subtree(c);
+		if (!cpus_empty(c->cpus_allowed))
+			guarantee_online_cpus(c, &c->cpus_allowed);
+		if (!nodes_empty(c->mems_allowed))
+			guarantee_online_mems(c, &c->mems_allowed);
+	}
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map. Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here. We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
 {
 	mutex_lock(&manage_mutex);
 	mutex_lock(&callback_mutex);
 
+	guarantee_online_cpus_mems_in_subtree(&top_cpuset);
 	top_cpuset.cpus_allowed = cpu_online_map;
+	top_cpuset.mems_allowed = node_online_map;
 
 	mutex_unlock(&callback_mutex);
 	mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
 
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+				unsigned long phase, void *cpu)
+{
+	common_cpu_mem_hotplug_unplug();
 	return 0;
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes()
+{
+	common_cpu_mem_hotplug_unplug();
+}
+#endif
+
 /**
  * cpuset_init_smp - initialize cpus_allowed
  *
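
Taken together, the hunk above keeps top_cpuset.cpus_allowed and top_cpuset.mems_allowed mirroring cpu_online_map and node_online_map, trimming child cpusets first so the parent/subset rule is never violated; the memory hotplug side is expected to call cpuset_track_online_nodes() from outside this file. A quick user-space check of that invariant, assuming the cpuset filesystem is mounted at /dev/cpuset (the mount point is an assumption):

/* Sketch only -- not part of this patch. */
#include <stdio.h>
#include <unistd.h>

static void dump(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (f && fgets(buf, sizeof(buf), f))
		printf("%-20s %s", path, buf);
	if (f)
		fclose(f);
}

int main(void)
{
	dump("/dev/cpuset/cpus");	/* should track cpu_online_map */
	dump("/dev/cpuset/mems");	/* should track node_online_map */
	printf("online cpus (sysconf): %ld\n", sysconf(_SC_NPROCESSORS_ONLN));
	return 0;
}
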
@@ -2245,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	int i;
 
 	for (i = 0; zl->zones[i]; i++) {
-		int nid = zl->zones[i]->zone_pgdat->node_id;
+		int nid = zone_to_nid(zl->zones[i]);
 
 		if (node_isset(nid, current->mems_allowed))
 			return 1;
@@ -2316,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
-	if (in_interrupt())
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = z->zone_pgdat->node_id;
+	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
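
The added __GFP_THISNODE test above means node-pinned allocations bypass the cpuset hardwall check entirely. A hedged kernel-style fragment of the kind of caller this exempts (the helper name is illustrative, not taken from this diff):

#include <linux/gfp.h>

/* Illustrative only: a node-pinned allocation like this is not filtered by
 * __cpuset_zone_allowed(), even if 'nid' is outside current->mems_allowed. */
static struct page *alloc_page_on_node(int nid)
{
	return alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE, 0);
}
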
