author		Tejun Heo <tj@kernel.org>	2013-01-07 11:51:07 -0500
committer	Tejun Heo <tj@kernel.org>	2013-01-07 11:51:07 -0500
commit		deb7aa308ea264b374d1db970174f5728a2faa27 (patch)
tree		eda76fcb0a19269db2d624f7c9eca5bb52c7102b /kernel/cpuset.c
parent		4e4c9a140fc2ecf5e086922ccd2022bdabe509b6 (diff)
cpuset: reorganize CPU / memory hotplug handling
Reorganize hotplug path to prepare for async hotplug handling.

* Both CPU and memory hotplug handlings are collected into a single
  function - cpuset_handle_hotplug().  It doesn't take any argument but
  compares the current settings of top_cpuset against what's actually
  available to determine what happened.  This function directly updates
  top_cpuset.  If there are CPUs or memory nodes which are taken down,
  cpuset_propagate_hotplug() is invoked on all !root cpusets.

* cpuset_propagate_hotplug() is responsible for updating the specified
  cpuset so that it doesn't include any resource which isn't available
  to top_cpuset.  If no CPU or memory is left after update, all tasks
  are moved to the nearest ancestor with both resources.

* update_tasks_cpumask() and update_tasks_nodemask() are now always
  called after cpus or mems masks are updated even if the cpuset doesn't
  have any task.  This is for brevity and not expected to have any
  measurable effect.

* cpu_active_mask and N_HIGH_MEMORY are read exactly once per
  cpuset_handle_hotplug() invocation, all cpusets share the same view
  of what resources are available, and cpuset_handle_hotplug() can
  handle multiple resources going up and down.  These properties will
  allow async operation.

The reorganization, while drastic, is equivalent and shouldn't cause
any behavior difference.  This will enable making hotplug handling
async and remove get_online_cpus() -> cgroup_mutex nesting.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
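For illustration only (not part of the patch): a minimal, self-contained user-space sketch of the propagation rule described in the second bullet above. The names toy_cpuset and propagate_hotplug are made up for this sketch, and plain unsigned bitmasks stand in for cpumask_t/nodemask_t; it strips resources that went offline from a child cpuset and, if nothing is left, walks up to the nearest ancestor that still has both CPUs and memory.

/* Illustrative user-space sketch only -- not kernel code from this patch. */
#include <stdio.h>

struct toy_cpuset {			/* hypothetical stand-in for struct cpuset */
	const char *name;
	unsigned int cpus;		/* one bit per CPU  */
	unsigned int mems;		/* one bit per node */
	struct toy_cpuset *parent;
};

/* Mirrors the rule of cpuset_propagate_hotplug(): @top holds what is still online. */
static void propagate_hotplug(struct toy_cpuset *cs, const struct toy_cpuset *top)
{
	cs->cpus &= top->cpus;		/* drop CPUs that went away */
	cs->mems &= top->mems;		/* drop memory nodes that went away */

	if (!cs->cpus || !cs->mems) {
		/* nearest ancestor with both resources would receive the tasks */
		struct toy_cpuset *p = cs->parent;

		while (p && (!p->cpus || !p->mems))
			p = p->parent;
		printf("%s emptied, tasks move to %s\n",
		       cs->name, p ? p->name : "<none>");
	}
}

int main(void)
{
	struct toy_cpuset top   = { "top",   0xf, 0x3, NULL };	/* cpus 0-3, nodes 0-1 */
	struct toy_cpuset child = { "child", 0xc, 0x2, &top };	/* cpus 2-3, node 1    */

	/* hot-unplug: cpus 2-3 and node 1 disappear from the online masks */
	top.cpus = 0x3;
	top.mems = 0x1;
	propagate_hotplug(&child, &top);
	return 0;
}

Running the sketch prints "child emptied, tasks move to top", which is the fallback behavior the commit message describes.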
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	221
1 file changed, 104 insertions(+), 117 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c5edc6b3eb28..3d448e646a4a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -148,12 +148,6 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
-/* the type of hotplug event */
-enum hotplug_event {
-	CPUSET_CPU_OFFLINE,
-	CPUSET_MEM_OFFLINE,
-};
-
 /* convenient tests for these bits */
 static inline bool is_cpuset_online(const struct cpuset *cs)
 {
@@ -2059,116 +2053,131 @@ static struct cpuset *cpuset_next(struct list_head *queue)
 	return cp;
 }
 
-
-/*
- * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
- * online/offline) and update the cpusets accordingly.
- * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
- * cpuset must be moved to a parent cpuset.
- *
- * Called with cgroup_mutex held.  We take callback_mutex to modify
- * cpus_allowed and mems_allowed.
- *
- * This walk processes the tree from top to bottom, completing one layer
- * before dropping down to the next.  It always processes a node before
- * any of its children.
- *
- * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
- * if all present pages from a node are offlined.
- */
-static void
-scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
+/**
+ * cpuset_propagate_hotplug - propagate CPU/memory hotplug to a cpuset
+ * @cs: cpuset in interest
+ *
+ * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
+ * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
+ * all its tasks are moved to the nearest ancestor with both resources.
+ *
+ * Should be called with cgroup_mutex held.
+ */
+static void cpuset_propagate_hotplug(struct cpuset *cs)
 {
-	LIST_HEAD(queue);
-	struct cpuset *cp;		/* scans cpusets being updated */
-	static nodemask_t oldmems;	/* protected by cgroup_mutex */
-
-	list_add_tail((struct list_head *)&root->stack_list, &queue);
-
-	switch (event) {
-	case CPUSET_CPU_OFFLINE:
-		while ((cp = cpuset_next(&queue)) != NULL) {
-
-			/* Continue past cpusets with all cpus online */
-			if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
-				continue;
-
-			/* Remove offline cpus from this cpuset. */
-			mutex_lock(&callback_mutex);
-			cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-							cpu_active_mask);
-			mutex_unlock(&callback_mutex);
-
-			/* Move tasks from the empty cpuset to a parent */
-			if (cpumask_empty(cp->cpus_allowed))
-				remove_tasks_in_empty_cpuset(cp);
-			else
-				update_tasks_cpumask(cp, NULL);
-		}
-		break;
-
-	case CPUSET_MEM_OFFLINE:
-		while ((cp = cpuset_next(&queue)) != NULL) {
-
-			/* Continue past cpusets with all mems online */
-			if (nodes_subset(cp->mems_allowed,
-					node_states[N_MEMORY]))
-				continue;
-
-			oldmems = cp->mems_allowed;
-
-			/* Remove offline mems from this cpuset. */
-			mutex_lock(&callback_mutex);
-			nodes_and(cp->mems_allowed, cp->mems_allowed,
-						node_states[N_MEMORY]);
-			mutex_unlock(&callback_mutex);
-
-			/* Move tasks from the empty cpuset to a parent */
-			if (nodes_empty(cp->mems_allowed))
-				remove_tasks_in_empty_cpuset(cp);
-			else
-				update_tasks_nodemask(cp, &oldmems, NULL);
-		}
-	}
+	static cpumask_t off_cpus;
+	static nodemask_t off_mems, tmp_mems;
+
+	WARN_ON_ONCE(!cgroup_lock_is_held());
+
+	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
+	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+
+	/* remove offline cpus from @cs */
+	if (!cpumask_empty(&off_cpus)) {
+		mutex_lock(&callback_mutex);
+		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+		mutex_unlock(&callback_mutex);
+		update_tasks_cpumask(cs, NULL);
+	}
+
+	/* remove offline mems from @cs */
+	if (!nodes_empty(off_mems)) {
+		tmp_mems = cs->mems_allowed;
+		mutex_lock(&callback_mutex);
+		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+		mutex_unlock(&callback_mutex);
+		update_tasks_nodemask(cs, &tmp_mems, NULL);
+	}
+
+	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+		remove_tasks_in_empty_cpuset(cs);
 }
 
-/*
- * The top_cpuset tracks what CPUs and Memory Nodes are online,
- * period.  This is necessary in order to make cpusets transparent
- * (of no affect) on systems that are actively using CPU hotplug
- * but making no active use of cpusets.
- *
- * The only exception to this is suspend/resume, where we don't
- * modify cpusets at all.
- *
- * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_active_mask on each CPU hotplug (cpuhp) event.
- *
- * Called within get_online_cpus().  Needs to call cgroup_lock()
- * before calling generate_sched_domains().
- *
- * @cpu_online: Indicates whether this is a CPU online event (true) or
- * a CPU offline event (false).
- */
-void cpuset_update_active_cpus(bool cpu_online)
+/**
+ * cpuset_handle_hotplug - handle CPU/memory hot[un]plug
+ *
+ * This function is called after either CPU or memory configuration has
+ * changed and updates cpuset accordingly.  The top_cpuset is always
+ * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
+ * order to make cpusets transparent (of no affect) on systems that are
+ * actively using CPU hotplug but making no active use of cpusets.
+ *
+ * Non-root cpusets are only affected by offlining.  If any CPUs or memory
+ * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
+ * descendants.
+ *
+ * Note that CPU offlining during suspend is ignored.  We don't modify
+ * cpusets across suspend/resume cycles at all.
+ */
+static void cpuset_handle_hotplug(void)
 {
-	struct sched_domain_attr *attr;
-	cpumask_var_t *doms;
-	int ndoms;
+	static cpumask_t new_cpus, tmp_cpus;
+	static nodemask_t new_mems, tmp_mems;
+	bool cpus_updated, mems_updated;
+	bool cpus_offlined, mems_offlined;
 
 	cgroup_lock();
-	mutex_lock(&callback_mutex);
-	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
-	mutex_unlock(&callback_mutex);
 
-	if (!cpu_online)
-		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);
+	/* fetch the available cpus/mems and find out which changed how */
+	cpumask_copy(&new_cpus, cpu_active_mask);
+	new_mems = node_states[N_MEMORY];
+
+	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
+	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
+				       &new_cpus);
+
+	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
+	mems_offlined = !nodes_empty(tmp_mems);
+
+	/* synchronize cpus_allowed to cpu_active_mask */
+	if (cpus_updated) {
+		mutex_lock(&callback_mutex);
+		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+		mutex_unlock(&callback_mutex);
+		/* we don't mess with cpumasks of tasks in top_cpuset */
+	}
+
+	/* synchronize mems_allowed to N_MEMORY */
+	if (mems_updated) {
+		tmp_mems = top_cpuset.mems_allowed;
+		mutex_lock(&callback_mutex);
+		top_cpuset.mems_allowed = new_mems;
+		mutex_unlock(&callback_mutex);
+		update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
+	}
+
+	/* if cpus or mems went down, we need to propagate to descendants */
+	if (cpus_offlined || mems_offlined) {
+		struct cpuset *cs;
+		LIST_HEAD(queue);
+
+		list_add_tail(&top_cpuset.stack_list, &queue);
+		while ((cs = cpuset_next(&queue)))
+			if (cs != &top_cpuset)
+				cpuset_propagate_hotplug(cs);
+	}
 
-	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
 
-	/* Have scheduler rebuild the domains */
-	partition_sched_domains(ndoms, doms, attr);
+	/* rebuild sched domains if cpus_allowed has changed */
+	if (cpus_updated) {
+		struct sched_domain_attr *attr;
+		cpumask_var_t *doms;
+		int ndoms;
+
+		cgroup_lock();
+		ndoms = generate_sched_domains(&doms, &attr);
+		cgroup_unlock();
+
+		partition_sched_domains(ndoms, doms, attr);
+	}
+}
+
+void cpuset_update_active_cpus(bool cpu_online)
+{
+	cpuset_handle_hotplug();
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2180,29 +2189,7 @@ void cpuset_update_active_cpus(bool cpu_online)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	static nodemask_t oldmems;	/* protected by cgroup_mutex */
-
-	cgroup_lock();
-	switch (action) {
-	case MEM_ONLINE:
-		oldmems = top_cpuset.mems_allowed;
-		mutex_lock(&callback_mutex);
-		top_cpuset.mems_allowed = node_states[N_MEMORY];
-		mutex_unlock(&callback_mutex);
-		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
-		break;
-	case MEM_OFFLINE:
-		/*
-		 * needn't update top_cpuset.mems_allowed explicitly because
-		 * scan_cpusets_upon_hotplug() will update it.
-		 */
-		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
-		break;
-	default:
-		break;
-	}
-	cgroup_unlock();
-
+	cpuset_handle_hotplug();
 	return NOTIFY_OK;
 }
 #endif