author:    Tejun Heo <tj@kernel.org>  2010-06-08 15:40:36 -0400
committer: Tejun Heo <tj@kernel.org>  2010-06-08 15:40:36 -0400
commit:    3a101d0548e925ab16ca6aaa8cf4f767d322ddb0
tree:      b90d8c5f2efe30fcfa49a00fdea037567c6cd46f
parent:    50a323b73069b169385a8ac65633dee837a7d13f
sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining
Currently, when a cpu goes down, cpu_active is cleared before
CPU_DOWN_PREPARE starts, and the cpuset configuration is updated from a
default-priority cpu notifier. When a cpu is coming up, cpu_active is
set before CPU_ONLINE, but the cpuset configuration is again updated
from the same default-priority notifier.
For cpu notifiers, this presents an inconsistent state. Threads which
a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can be
migrated to other cpus because the cpu is no longer active by the time
the notifier runs.
Fix it by updating cpu_active from the highest-priority cpu notifier
and the cpuset configuration from the second highest when a cpu is
coming up. The down path is updated symmetrically, using the two
lowest priorities, so cpu_active is cleared and the cpuset
configuration updated only after every other CPU_DOWN_PREPARE notifier
has run. This guarantees that all other cpu notifiers see consistent
cpu_active and cpuset configuration.
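To make the resulting ordering concrete, here is an illustrative
sketch (not part of the patch; derived from the CPU_PRI_* values added
to include/linux/cpu.h below):

	/*
	 * Up path (CPU_ONLINE, CPU_DOWN_FAILED), highest priority first:
	 *   1. sched_cpu_active()    CPU_PRI_SCHED_ACTIVE  (INT_MAX)      sets cpu_active
	 *   2. cpuset_cpu_active()   CPU_PRI_CPUSET_ACTIVE (INT_MAX - 1)  updates cpusets
	 *   3. all other notifiers   already see the cpu as active
	 *
	 * Down path (CPU_DOWN_PREPARE), again highest priority first:
	 *   1. all other notifiers   still see the cpu as active
	 *   2. sched_cpu_inactive()  CPU_PRI_SCHED_INACTIVE  (INT_MIN + 1)  clears cpu_active
	 *   3. cpuset_cpu_inactive() CPU_PRI_CPUSET_INACTIVE (INT_MIN)      updates cpusets
	 */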
The cpuset_track_online_cpus() notifier is converted to
cpuset_update_active_cpus(), which just updates the configuration and
is now called from the cpuset_cpu_[in]active() notifiers registered in
sched_init_smp(). If cpuset is disabled, cpuset_update_active_cpus()
degenerates into a call to partition_sched_domains(), making a
separate notifier for !CONFIG_CPUSETS unnecessary.
This problem is triggered by cmwq: during CPU_DOWN_PREPARE, its
hotplug callback creates a kthread and kthread_bind()s it to the
target cpu, expecting the thread to run on that cpu. A minimal sketch
of that pattern follows.
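The sketch below is hypothetical (my_hotplug_callback, my_worker_fn,
and "my_worker/%u" are illustrative names, not taken from cmwq); it
shows the shape of a CPU_DOWN_PREPARE callback that depends on the new
ordering, since the bound kthread is only guaranteed to stay on the
target cpu if cpu_active is still set when the callback runs:

	#include <linux/cpu.h>
	#include <linux/err.h>
	#include <linux/kthread.h>
	#include <linux/notifier.h>
	#include <linux/sched.h>

	static int my_worker_fn(void *unused)
	{
		/* runs pinned to the target cpu until teardown completes */
		return 0;
	}

	static int my_hotplug_callback(struct notifier_block *nfb,
				       unsigned long action, void *hcpu)
	{
		unsigned int cpu = (unsigned long)hcpu;
		struct task_struct *worker;

		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_DOWN_PREPARE:
			/* create a kthread for the dying cpu and pin it there */
			worker = kthread_create(my_worker_fn, NULL,
						"my_worker/%u", cpu);
			if (IS_ERR(worker))
				return notifier_from_errno(PTR_ERR(worker));
			/* relies on cpu_active still being set at this point */
			kthread_bind(worker, cpu);
			wake_up_process(worker);
			return NOTIFY_OK;
		default:
			return NOTIFY_DONE;
		}
	}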
* Ingo's test discovered that the __cpuinit/__cpuexit markups were
  incorrect. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Menage <menage@google.com>
 include/linux/cpu.h    | 16
 include/linux/cpuset.h |  6
 kernel/cpu.c           |  6
 kernel/cpuset.c        | 21
 kernel/sched.c         | 67
 5 files changed, 74 insertions(+), 42 deletions(-)
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2d9073883ea9..de6b1722cdca 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -52,6 +52,22 @@ struct notifier_block;
  * CPU notifier priorities.
  */
 enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE.  During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in the similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 457ed765a116..f20eb8f16025 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,6 +20,7 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
+static inline void cpuset_update_active_cpus(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
+
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	set_cpu_active(cpu, false);
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
-		set_cpu_active(cpu, true);
-
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
-		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	set_cpu_active(cpu, true);
-
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..05727dcaa80d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * but making no active use of cpusets.
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
  */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-				unsigned long phase, void *unused_cpu)
+void __cpuexit cpuset_update_active_cpus(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
-	switch (phase) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
-	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_track_online_cpus, 0);
 	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
 	cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/sched.c b/kernel/sched.c
index 552faf8d358c..2b942e49d0fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5804,17 +5804,46 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	/* Start one for the boot CPU: */
+	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
 	return 0;
 }
 early_initcall(migration_init);
@@ -7273,29 +7302,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
  */
-static int update_sched_domains(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb,
+				       unsigned long action, void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		partition_sched_domains(1, NULL, NULL);
+		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
 
+static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb,
+					 unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
 }
-#endif
 
 static int update_runtime(struct notifier_block *nfb,
 			  unsigned long action, void *hcpu)
@@ -7341,10 +7376,8 @@ void __init sched_init_smp(void)
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
-#ifndef CONFIG_CPUSETS
-	/* XXX: Theoretical race here - CPU may be hotplugged now */
-	hotcpu_notifier(update_sched_domains, 0);
-#endif
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
 	/* RT runtime code needs to handle some hotplug events */
 	hotcpu_notifier(update_runtime, 0);