aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cpuset.c
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2013-01-07 11:51:07 -0500
committerTejun Heo <tj@kernel.org>2013-01-07 11:51:07 -0500
commit3a5a6d0c2b0391e159fa5bf1dddb9bf1f35178a0 (patch)
treeaafc1a1207631f277731afe9bff237ece2554efb /kernel/cpuset.c
parentdeb7aa308ea264b374d1db970174f5728a2faa27 (diff)
cpuset: don't nest cgroup_mutex inside get_online_cpus()
The CPU / memory hotplug path currently grabs cgroup_mutex from hotplug event notifications. We want to separate cpuset locking from cgroup core and make cgroup_mutex outer to hotplug synchronization so that, among other things, mechanisms which depend on get_online_cpus() can be used from cgroup callbacks. In general, we want to keep cgroup_mutex the outermost lock to minimize locking interactions among different controllers. Convert cpuset_handle_hotplug() to cpuset_hotplug_workfn() and schedule it from the hotplug notifications. As the function can already handle multiple mixed events without any input, converting it to a work function is mostly trivial; however, one complication is that cpuset_update_active_cpus() needs to update sched domains synchronously to reflect an offlined cpu to avoid confusing the scheduler. This is worked around by falling back to the default single sched domain synchronously before scheduling the actual hotplug work. This makes the sched domains get rebuilt twice per CPU hotplug event, but the operation isn't that heavy and a lot of the second operation would be a noop for systems w/ a single sched domain, which is the common case. This decouples cpuset hotplug handling from the notification callbacks and there can be an arbitrary delay between the actual event and updates to cpusets. Scheduler and mm can handle it fine but moving tasks out of an empty cpuset may race against writes to the cpuset restoring execution resources, which can lead to confusing behavior. Flush the hotplug work item from cpuset_write_resmask() to avoid such confusion. v2: Synchronous sched domain rebuilding using the fallback sched domain added. This fixes various issues caused by a confused scheduler putting tasks on a dead CPU, including the one reported by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com>
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--kernel/cpuset.c39
1 files changed, 35 insertions, 4 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3d448e646a4a..658eb1a32084 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -260,6 +260,13 @@ static char cpuset_nodelist[CPUSET_NODELIST_LEN];
260static DEFINE_SPINLOCK(cpuset_buffer_lock); 260static DEFINE_SPINLOCK(cpuset_buffer_lock);
261 261
262/* 262/*
263 * CPU / memory hotplug is handled asynchronously.
264 */
265static void cpuset_hotplug_workfn(struct work_struct *work);
266
267static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
268
269/*
263 * This is ugly, but preserves the userspace API for existing cpuset 270 * This is ugly, but preserves the userspace API for existing cpuset
264 * users. If someone tries to mount the "cpuset" filesystem, we 271 * users. If someone tries to mount the "cpuset" filesystem, we
265 * silently switch it to mount "cgroup" instead 272 * silently switch it to mount "cgroup" instead
@@ -1565,6 +1572,19 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1565 struct cpuset *cs = cgroup_cs(cgrp); 1572 struct cpuset *cs = cgroup_cs(cgrp);
1566 struct cpuset *trialcs; 1573 struct cpuset *trialcs;
1567 1574
1575 /*
1576 * CPU or memory hotunplug may leave @cs w/o any execution
1577 * resources, in which case the hotplug code asynchronously updates
1578 * configuration and transfers all tasks to the nearest ancestor
1579 * which can execute.
1580 *
1581 * As writes to "cpus" or "mems" may restore @cs's execution
1582 * resources, wait for the previously scheduled operations before
1583 * proceeding, so that we don't end up keep removing tasks added
1584 * after execution capability is restored.
1585 */
1586 flush_work(&cpuset_hotplug_work);
1587
1568 if (!cgroup_lock_live_group(cgrp)) 1588 if (!cgroup_lock_live_group(cgrp))
1569 return -ENODEV; 1589 return -ENODEV;
1570 1590
@@ -2095,7 +2115,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs)
2095} 2115}
2096 2116
2097/** 2117/**
2098 * cpuset_handle_hotplug - handle CPU/memory hot[un]plug 2118 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
2099 * 2119 *
2100 * This function is called after either CPU or memory configuration has 2120 * This function is called after either CPU or memory configuration has
2101 * changed and updates cpuset accordingly. The top_cpuset is always 2121 * changed and updates cpuset accordingly. The top_cpuset is always
@@ -2110,7 +2130,7 @@ static void cpuset_propagate_hotplug(struct cpuset *cs)
2110 * Note that CPU offlining during suspend is ignored. We don't modify 2130 * Note that CPU offlining during suspend is ignored. We don't modify
2111 * cpusets across suspend/resume cycles at all. 2131 * cpusets across suspend/resume cycles at all.
2112 */ 2132 */
2113static void cpuset_handle_hotplug(void) 2133static void cpuset_hotplug_workfn(struct work_struct *work)
2114{ 2134{
2115 static cpumask_t new_cpus, tmp_cpus; 2135 static cpumask_t new_cpus, tmp_cpus;
2116 static nodemask_t new_mems, tmp_mems; 2136 static nodemask_t new_mems, tmp_mems;
@@ -2177,7 +2197,18 @@ static void cpuset_handle_hotplug(void)
2177 2197
2178void cpuset_update_active_cpus(bool cpu_online) 2198void cpuset_update_active_cpus(bool cpu_online)
2179{ 2199{
2180 cpuset_handle_hotplug(); 2200 /*
2201 * We're inside cpu hotplug critical region which usually nests
2202 * inside cgroup synchronization. Bounce actual hotplug processing
2203 * to a work item to avoid reverse locking order.
2204 *
2205 * We still need to do partition_sched_domains() synchronously;
2206 * otherwise, the scheduler will get confused and put tasks to the
2207 * dead CPU. Fall back to the default single domain.
2208 * cpuset_hotplug_workfn() will rebuild it as necessary.
2209 */
2210 partition_sched_domains(1, NULL, NULL);
2211 schedule_work(&cpuset_hotplug_work);
2181} 2212}
2182 2213
2183#ifdef CONFIG_MEMORY_HOTPLUG 2214#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2189,7 +2220,7 @@ void cpuset_update_active_cpus(bool cpu_online)
2189static int cpuset_track_online_nodes(struct notifier_block *self, 2220static int cpuset_track_online_nodes(struct notifier_block *self,
2190 unsigned long action, void *arg) 2221 unsigned long action, void *arg)
2191{ 2222{
2192 cpuset_handle_hotplug(); 2223 schedule_work(&cpuset_hotplug_work);
2193 return NOTIFY_OK; 2224 return NOTIFY_OK;
2194} 2225}
2195#endif 2226#endif