author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2009-11-25 07:31:39 -0500
committer	Ingo Molnar <mingo@elte.hu>	2009-12-06 15:10:56 -0500
commit		6ad4c18884e864cf4c77f9074d3d1816063f99cd
tree		f09643f6148b576fa2d23bf7d4b37d082d94e267
parent		e1b8090bdf125f8b2e192149547fead7f302a89c
sched: Fix balance vs hotplug race
Since (e761b77: cpu hotplug, sched: Introduce cpu_active_map and redo
sched domain managment) we have cpu_active_mask, which is supposed to rule
scheduler migration and load-balancing, except it never (fully) did.
The particular problem being solved here is a crash in try_to_wake_up()
where select_task_rq() ends up selecting an offline cpu, because
select_task_rq_fair() trusts the sched_domain tree to reflect the
current state of affairs; similarly, select_task_rq_rt() trusts the
root_domain.
However, the sched_domains are updated from CPU_DEAD, which is after the
cpu is taken offline and after stop_machine is done. The domain update can
therefore race with code that assumes the domains are right.
Cure this by building the domains from cpu_active_mask on
CPU_DOWN_PREPARE.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
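As a minimal sketch of the ordering this establishes, consider the userspace analogue below. It is an editor's illustration only, not kernel code; the names (slot_active, grace, offline_slot, select_slot) are invented, and the rwlock merely stands in for synchronize_sched(). The point is the sequence the patch enforces: clear the "active" bit first, wait out anyone who may still be selecting a target from the stale view, and only then actually take the resource down.

/*
 * Minimal userspace sketch (invented names, not kernel code): the same
 * "deactivate, wait for a grace period, then offline" ordering the patch
 * enforces with set_cpu_active(cpu, false) + synchronize_sched() before
 * __stop_machine() actually kills the CPU.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 4

static atomic_bool slot_active[NR_SLOTS];	/* analogue of cpu_active_mask */
static atomic_bool slot_online[NR_SLOTS];	/* analogue of cpu_online_mask */
static pthread_rwlock_t grace;			/* crude stand-in for a grace period */

/* Placement side: only ever pick a slot that is still marked active. */
static int select_slot(void)
{
	int pick = -1;

	pthread_rwlock_rdlock(&grace);		/* "read-side critical section" */
	for (int i = 0; i < NR_SLOTS; i++) {
		if (atomic_load(&slot_active[i])) {
			pick = i;
			break;
		}
	}
	pthread_rwlock_unlock(&grace);
	return pick;
}

/* Unplug side: deactivate first, wait out selectors, then go offline. */
static void offline_slot(int i)
{
	atomic_store(&slot_active[i], false);	/* like set_cpu_active(cpu, false) */
	pthread_rwlock_wrlock(&grace);		/* like synchronize_sched(): every
						   select_slot() that started before
						   the deactivation has finished */
	pthread_rwlock_unlock(&grace);
	atomic_store(&slot_online[i], false);	/* only now is the slot really gone */
}

int main(void)
{
	pthread_rwlock_init(&grace, NULL);
	for (int i = 0; i < NR_SLOTS; i++) {
		atomic_store(&slot_active[i], true);
		atomic_store(&slot_online[i], true);
	}
	offline_slot(3);
	printf("picked slot %d after offlining slot 3\n", select_slot());
	return 0;
}

Build with something like cc -std=c11 -pthread; only the ordering inside offline_slot() matters, which is what the disable_nonboot_cpus() and _cpu_down() hunks below add.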
-rw-r--r--  include/linux/cpumask.h |  2
-rw-r--r--  kernel/cpu.c            | 18
-rw-r--r--  kernel/cpuset.c         | 16
-rw-r--r--  kernel/sched.c          | 32
4 files changed, 41 insertions(+), 27 deletions(-)
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 789cf5f920ce..d77b54733c5b 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -84,6 +84,7 @@ extern const struct cpumask *const cpu_active_mask;
 #define num_online_cpus()	cpumask_weight(cpu_online_mask)
 #define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
 #define num_present_cpus()	cpumask_weight(cpu_present_mask)
+#define num_active_cpus()	cpumask_weight(cpu_active_mask)
 #define cpu_online(cpu)		cpumask_test_cpu((cpu), cpu_online_mask)
 #define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
 #define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
@@ -92,6 +93,7 @@ extern const struct cpumask *const cpu_active_mask;
 #define num_online_cpus()	1
 #define num_possible_cpus()	1
 #define num_present_cpus()	1
+#define num_active_cpus()	1
 #define cpu_online(cpu)		((cpu) == 0)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb212..b21688640377 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -212,6 +212,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 					hcpu, -1, &nr_calls);
 	if (err == NOTIFY_BAD) {
+		set_cpu_active(cpu, true);
+
 		nr_calls--;
 		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					  hcpu, nr_calls, NULL);
@@ -223,11 +225,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	/* Ensure that we are not runnable on dying cpu */
 	cpumask_copy(old_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current,
-		     cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
+	set_cpus_allowed_ptr(current, cpu_active_mask);
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
+		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone. Can't complain. */
 		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
 					    hcpu) == NOTIFY_BAD)
@@ -292,9 +294,6 @@ int __ref cpu_down(unsigned int cpu)
 
 	err = _cpu_down(cpu, 0);
 
-	if (cpu_online(cpu))
-		set_cpu_active(cpu, true);
-
 out:
 	cpu_maps_update_done();
 	stop_machine_destroy();
@@ -387,6 +386,15 @@ int disable_nonboot_cpus(void)
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 */
 	cpumask_clear(frozen_cpus);
+
+	for_each_online_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		set_cpu_active(cpu, false);
+	}
+
+	synchronize_sched();
+
 	printk("Disabling non-boot CPUs ...\n");
 	for_each_online_cpu(cpu) {
 		if (cpu == first_cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 43fb7e800028..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		if (retval < 0)
 			return retval;
 
-		if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask))
+		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
 			return -EINVAL;
 	}
 	retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 	}
 
 	/* Continue past cpusets with all cpus, mems online */
-	if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) &&
+	if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
 	    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 		continue;
 
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
 		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
-			    cpu_online_mask);
+			    cpu_active_mask);
 		nodes_and(cp->mems_allowed, cp->mems_allowed,
 			  node_states[N_HIGH_MEMORY]);
 		mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 	switch (phase) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		break;
 
 	default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
-	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	mutex_unlock(&callback_mutex);
 	scan_for_empty_cpusets(&top_cpuset);
 	ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 
 void __init cpuset_init_smp(void)
 {
-	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
 	hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/sched.c b/kernel/sched.c
index aa31244caa9f..281da29d0801 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4134,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long flags;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_copy(cpus, cpu_online_mask);
+	cpumask_copy(cpus, cpu_active_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4297,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int all_pinned = 0;
 	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
-	cpumask_copy(cpus, cpu_online_mask);
+	cpumask_copy(cpus, cpu_active_mask);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -4694,7 +4694,7 @@ int select_nohz_load_balancer(int stop_tick)
 		cpumask_set_cpu(cpu, nohz.cpu_mask);
 
 		/* time for ilb owner also to sleep */
-		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
+		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
 			if (atomic_read(&nohz.load_balancer) == cpu)
 				atomic_set(&nohz.load_balancer, -1);
 			return 0;
@@ -7093,7 +7093,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	int ret = 0;
 
 	rq = task_rq_lock(p, &flags);
-	if (!cpumask_intersects(new_mask, cpu_online_mask)) {
+	if (!cpumask_intersects(new_mask, cpu_active_mask)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -7115,7 +7115,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
-	if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
+	if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
 		/* Need help from migration thread: drop lock and wait. */
 		struct task_struct *mt = rq->migration_thread;
 
@@ -7269,19 +7269,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 
 again:
 	/* Look for allowed, online CPU in same node. */
-	for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
 		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 			goto move;
 
 	/* Any allowed, online CPU? */
-	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
 	if (dest_cpu < nr_cpu_ids)
 		goto move;
 
 	/* No more Mr. Nice Guy. */
 	if (dest_cpu >= nr_cpu_ids) {
 		cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
-		dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
+		dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
 
 		/*
 		 * Don't tell them about moving exiting tasks or
@@ -7310,7 +7310,7 @@ move:
  */
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
-	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
+	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -7564,7 +7564,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 static struct ctl_table_header *sd_sysctl_header;
 static void register_sched_domain_sysctl(void)
 {
-	int i, cpu_num = num_online_cpus();
+	int i, cpu_num = num_possible_cpus();
 	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
 	char buf[32];
 
@@ -7574,7 +7574,7 @@ static void register_sched_domain_sysctl(void)
 	if (entry == NULL)
 		return;
 
-	for_each_online_cpu(i) {
+	for_each_possible_cpu(i) {
 		snprintf(buf, 32, "cpu%d", i);
 		entry->procname = kstrdup(buf, GFP_KERNEL);
 		entry->mode = 0555;
@@ -9100,7 +9100,7 @@ match1:
 	if (doms_new == NULL) {
 		ndoms_cur = 0;
 		doms_new = &fallback_doms;
-		cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
+		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
 		WARN_ON_ONCE(dattr_new);
 	}
 
@@ -9231,8 +9231,10 @@ static int update_sched_domains(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
 		partition_sched_domains(1, NULL, NULL);
 		return NOTIFY_OK;
 
@@ -9279,7 +9281,7 @@ void __init sched_init_smp(void)
 #endif
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
-	arch_init_sched_domains(cpu_online_mask);
+	arch_init_sched_domains(cpu_active_mask);
 	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
 	if (cpumask_empty(non_isolated_cpus))
 		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);