author     Tejun Heo <tj@kernel.org>    2010-06-08 15:40:36 -0400
committer  Tejun Heo <tj@kernel.org>    2010-06-08 15:40:36 -0400
commit     3a101d0548e925ab16ca6aaa8cf4f767d322ddb0 (patch)
tree       b90d8c5f2efe30fcfa49a00fdea037567c6cd46f
parent     50a323b73069b169385a8ac65633dee837a7d13f (diff)
sched: adjust when cpu_active and cpuset configurations are updated during cpu on/offlining
Currently, when a cpu goes down, cpu_active is cleared before
CPU_DOWN_PREPARE starts and cpuset configuration is updated from a
default priority cpu notifier.  When a cpu is coming up, it's set
before CPU_ONLINE but cpuset configuration is again updated from the
same cpu notifier.

For cpu notifiers, this presents an inconsistent state.  Threads
which a CPU_DOWN_PREPARE notifier expects to be bound to the CPU can
be migrated to other cpus because the cpu is no longer active.

Fix it by updating cpu_active in the highest priority cpu notifier
and cpuset configuration in the second highest when a cpu is coming
up.  The down path is updated similarly.  This guarantees that all
other cpu notifiers see consistent cpu_active and cpuset
configuration.

The cpuset_track_online_cpus() notifier is converted to
cpuset_update_active_cpus(), which just updates the configuration and
is now called from the cpuset_cpu_[in]active() notifiers registered
from sched_init_smp().  If cpuset is disabled,
cpuset_update_active_cpus() degenerates into
partition_sched_domains(), making a separate notifier for
!CONFIG_CPUSETS unnecessary.

This problem is triggered by cmwq: during CPU_DOWN_PREPARE, a hotplug
callback creates a kthread and kthread_bind()s it to the target cpu,
and the thread is expected to run on that cpu.

* Ingo's test discovered that the __cpuinit/exit markups were
  incorrect.  Fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Menage <menage@google.com>
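To make the cmwq trigger concrete, the following is a minimal sketch
of the kind of CPU_DOWN_PREPARE callback described above.  It is not
part of this patch: example_down_prepare, example_trim_fn and the
"trimmer/%u" name are invented for illustration; only the
kthread_create() + kthread_bind() pattern comes from the description.

#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/notifier.h>
#include <linux/sched.h>

/* Per-cpu worker body; what it does is irrelevant to the example. */
static int example_trim_fn(void *unused)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static int example_down_prepare(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct task_struct *k;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_PREPARE:
		k = kthread_create(example_trim_fn, NULL, "trimmer/%u", cpu);
		if (IS_ERR(k))
			return notifier_from_errno(PTR_ERR(k));
		/*
		 * The binding only holds if the scheduler still treats
		 * @cpu as active.  Before this patch cpu_active was
		 * already clear when DOWN_PREPARE ran, so the thread
		 * could be migrated to another cpu.
		 */
		kthread_bind(k, cpu);
		wake_up_process(k);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}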
-rw-r--r--	include/linux/cpu.h	16
-rw-r--r--	include/linux/cpuset.h	6
-rw-r--r--	kernel/cpu.c	6
-rw-r--r--	kernel/cpuset.c	21
-rw-r--r--	kernel/sched.c	67
5 files changed, 74 insertions, 42 deletions
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 2d9073883ea9..de6b1722cdca 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -52,6 +52,22 @@ struct notifier_block;
  * CPU notifier priorities.
  */
 enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE.  During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in the similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
 	/* migration should happen before other stuff but after perf */
 	CPU_PRI_PERF		= 20,
 	CPU_PRI_MIGRATION	= 10,
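Notifier chains invoke callbacks from highest to lowest priority for
each event, so with the priorities above an ordinary priority-0
callback always observes the cpu as active: during CPU_ONLINE,
sched_cpu_active (INT_MAX) has already run; during CPU_DOWN_PREPARE,
sched_cpu_inactive (INT_MIN + 1) has not run yet.  A hypothetical
sanity-check notifier (not part of this patch) that exercises the
guarantee:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

/* Invented example: checks the ordering guarantee documented above. */
static int active_sanity_check(struct notifier_block *nfb,
			       unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		/* sched_cpu_active (INT_MAX) already set cpu_active. */
		WARN_ON(!cpu_active(cpu));
		return NOTIFY_OK;
	case CPU_DOWN_PREPARE:
		/* sched_cpu_inactive (INT_MIN + 1) runs after us. */
		WARN_ON(!cpu_active(cpu));
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}

static int __init active_sanity_init(void)
{
	/* Default priority 0 sits between the two pairs above. */
	hotcpu_notifier(active_sanity_check, 0);
	return 0;
}
early_initcall(active_sanity_init);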
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 457ed765a116..f20eb8f16025 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,6 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
+static inline void cpuset_update_active_cpus(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
+
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	set_cpu_active(cpu, false);
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
-		set_cpu_active(cpu, true);
-
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
-		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	set_cpu_active(cpu, true);
-
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..05727dcaa80d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * but making no active use of cpusets.
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
  */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-				    unsigned long phase, void *unused_cpu)
+void __cpuexit cpuset_update_active_cpus(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
-	switch (phase) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
-	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_track_online_cpus, 0);
 	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
 	cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/sched.c b/kernel/sched.c
index 552faf8d358c..2b942e49d0fa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5804,17 +5804,46 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	/* Start one for the boot CPU: */
+	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
 	return 0;
 }
 early_initcall(migration_init);
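Both new callbacks mask out CPU_TASKS_FROZEN before switching on the
event.  Each _FROZEN event is the base event with the
CPU_TASKS_FROZEN bit ORed in (CPU_ONLINE_FROZEN == CPU_ONLINE |
CPU_TASKS_FROZEN), which is also why the six-case switch in the old
cpuset_track_online_cpus() collapses to a few cases below.  A small
illustration with an invented helper:

#include <linux/cpu.h>
#include <linux/types.h>

/*
 * Invented helper: masking off the frozen bit lets one case label
 * cover both the normal and the suspend/resume variant of an event.
 */
static inline bool is_cpu_online_event(unsigned long action)
{
	return (action & ~CPU_TASKS_FROZEN) == CPU_ONLINE;
}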
@@ -7273,29 +7302,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
  */
-static int update_sched_domains(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int __cpuexit cpuset_cpu_active(struct notifier_block *nfb,
+				       unsigned long action, void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		partition_sched_domains(1, NULL, NULL);
+		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
 
+static int __cpuexit cpuset_cpu_inactive(struct notifier_block *nfb,
+					 unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
 }
-#endif
 
 static int update_runtime(struct notifier_block *nfb,
 			  unsigned long action, void *hcpu)
@@ -7341,10 +7376,8 @@ void __init sched_init_smp(void)
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
-#ifndef CONFIG_CPUSETS
-	/* XXX: Theoretical race here - CPU may be hotplugged now */
-	hotcpu_notifier(update_sched_domains, 0);
-#endif
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
 	/* RT runtime code needs to handle some hotplug events */
 	hotcpu_notifier(update_runtime, 0);