diff options
author | Max Krasnyansky <maxk@qualcomm.com> | 2008-07-15 07:43:49 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-07-18 07:22:25 -0400 |
commit | e761b7725234276a802322549cee5255305a0930 (patch) | |
tree | 27b351a7d5fc9a93590e0effce1c5adb1bfcebc0 /kernel | |
parent | 7ebefa8ceefed44cc321be70afc54a585a68ac0b (diff) |
cpu hotplug, sched: Introduce cpu_active_map and redo sched domain management (take 2)
This is based on Linus' idea of creating cpu_active_map that prevents
scheduler load balancer from migrating tasks to the cpu that is going
down.
It allows us to simplify domain management code and avoid unnecessary
domain rebuilds during cpu hotplug event handling.
Please ignore the cpusets part for now. It needs some more work in order
to avoid crazy lock nesting. Although I did simplify and unify domain
reinitialization logic. We now simply call partition_sched_domains() in
all the cases. This means that we're using exact same code paths as in
cpusets case and hence the tests below cover cpusets too.
Cpuset changes to make rebuild_sched_domains() callable from various
contexts are in the separate patch (right next after this one).
This not only boots but also easily handles
while true; do make clean; make -j 8; done
and
while true; do on-off-cpu 1; done
at the same time.
(on-off-cpu 1 simply does echo 0/1 > /sys/.../cpu1/online thing).
Surprisingly the box (dual-core Core2) is quite usable. In fact I'm typing
this on right now in gnome-terminal and things are moving just fine.
Also this is running with most of the debug features enabled (lockdep,
mutex, etc) no BUG_ONs or lockdep complaints so far.
I believe I addressed all of Dmitry's comments for original Linus'
version. I changed both fair and rt balancer to mask out non-active cpus.
And replaced cpu_is_offline() with !cpu_active() in the main scheduler
code where it made sense (to me).
Signed-off-by: Max Krasnyanskiy <maxk@qualcomm.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Cc: dmitry.adamushko@gmail.com
Cc: pj@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpu.c | 30 | ||||
-rw-r--r-- | kernel/cpuset.c | 2 | ||||
-rw-r--r-- | kernel/sched.c | 108 | ||||
-rw-r--r-- | kernel/sched_fair.c | 3 | ||||
-rw-r--r-- | kernel/sched_rt.c | 7 |
5 files changed, 80 insertions, 70 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c index cfb1d43ab80..a1ac7ea245d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void) | |||
64 | cpu_hotplug.refcount = 0; | 64 | cpu_hotplug.refcount = 0; |
65 | } | 65 | } |
66 | 66 | ||
67 | cpumask_t cpu_active_map; | ||
68 | |||
67 | #ifdef CONFIG_HOTPLUG_CPU | 69 | #ifdef CONFIG_HOTPLUG_CPU |
68 | 70 | ||
69 | void get_online_cpus(void) | 71 | void get_online_cpus(void) |
@@ -291,11 +293,20 @@ int __ref cpu_down(unsigned int cpu) | |||
291 | int err = 0; | 293 | int err = 0; |
292 | 294 | ||
293 | cpu_maps_update_begin(); | 295 | cpu_maps_update_begin(); |
294 | if (cpu_hotplug_disabled) | 296 | |
297 | if (cpu_hotplug_disabled) { | ||
295 | err = -EBUSY; | 298 | err = -EBUSY; |
296 | else | 299 | goto out; |
297 | err = _cpu_down(cpu, 0); | 300 | } |
301 | |||
302 | cpu_clear(cpu, cpu_active_map); | ||
303 | |||
304 | err = _cpu_down(cpu, 0); | ||
305 | |||
306 | if (cpu_online(cpu)) | ||
307 | cpu_set(cpu, cpu_active_map); | ||
298 | 308 | ||
309 | out: | ||
299 | cpu_maps_update_done(); | 310 | cpu_maps_update_done(); |
300 | return err; | 311 | return err; |
301 | } | 312 | } |
@@ -355,11 +366,18 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
355 | } | 366 | } |
356 | 367 | ||
357 | cpu_maps_update_begin(); | 368 | cpu_maps_update_begin(); |
358 | if (cpu_hotplug_disabled) | 369 | |
370 | if (cpu_hotplug_disabled) { | ||
359 | err = -EBUSY; | 371 | err = -EBUSY; |
360 | else | 372 | goto out; |
361 | err = _cpu_up(cpu, 0); | 373 | } |
362 | 374 | ||
375 | err = _cpu_up(cpu, 0); | ||
376 | |||
377 | if (cpu_online(cpu)) | ||
378 | cpu_set(cpu, cpu_active_map); | ||
379 | |||
380 | out: | ||
363 | cpu_maps_update_done(); | 381 | cpu_maps_update_done(); |
364 | return err; | 382 | return err; |
365 | } | 383 | } |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 459d601947a..3c3ef02f65f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
564 | * partition_sched_domains(). | 564 | * partition_sched_domains(). |
565 | */ | 565 | */ |
566 | 566 | ||
567 | static void rebuild_sched_domains(void) | 567 | void rebuild_sched_domains(void) |
568 | { | 568 | { |
569 | struct kfifo *q; /* queue of cpusets to be scanned */ | 569 | struct kfifo *q; /* queue of cpusets to be scanned */ |
570 | struct cpuset *cp; /* scans q */ | 570 | struct cpuset *cp; /* scans q */ |
diff --git a/kernel/sched.c b/kernel/sched.c index 1ee18dbb451..c237624a8a0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -2881,7 +2881,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu) | |||
2881 | 2881 | ||
2882 | rq = task_rq_lock(p, &flags); | 2882 | rq = task_rq_lock(p, &flags); |
2883 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 2883 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
2884 | || unlikely(cpu_is_offline(dest_cpu))) | 2884 | || unlikely(!cpu_active(dest_cpu))) |
2885 | goto out; | 2885 | goto out; |
2886 | 2886 | ||
2887 | /* force the process onto the specified CPU */ | 2887 | /* force the process onto the specified CPU */ |
@@ -3849,7 +3849,7 @@ int select_nohz_load_balancer(int stop_tick) | |||
3849 | /* | 3849 | /* |
3850 | * If we are going offline and still the leader, give up! | 3850 | * If we are going offline and still the leader, give up! |
3851 | */ | 3851 | */ |
3852 | if (cpu_is_offline(cpu) && | 3852 | if (!cpu_active(cpu) && |
3853 | atomic_read(&nohz.load_balancer) == cpu) { | 3853 | atomic_read(&nohz.load_balancer) == cpu) { |
3854 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 3854 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) |
3855 | BUG(); | 3855 | BUG(); |
@@ -5876,7 +5876,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
5876 | struct rq *rq_dest, *rq_src; | 5876 | struct rq *rq_dest, *rq_src; |
5877 | int ret = 0, on_rq; | 5877 | int ret = 0, on_rq; |
5878 | 5878 | ||
5879 | if (unlikely(cpu_is_offline(dest_cpu))) | 5879 | if (unlikely(!cpu_active(dest_cpu))) |
5880 | return ret; | 5880 | return ret; |
5881 | 5881 | ||
5882 | rq_src = cpu_rq(src_cpu); | 5882 | rq_src = cpu_rq(src_cpu); |
@@ -7554,18 +7554,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void) | |||
7554 | } | 7554 | } |
7555 | 7555 | ||
7556 | /* | 7556 | /* |
7557 | * Free current domain masks. | ||
7558 | * Called after all cpus are attached to NULL domain. | ||
7559 | */ | ||
7560 | static void free_sched_domains(void) | ||
7561 | { | ||
7562 | ndoms_cur = 0; | ||
7563 | if (doms_cur != &fallback_doms) | ||
7564 | kfree(doms_cur); | ||
7565 | doms_cur = &fallback_doms; | ||
7566 | } | ||
7567 | |||
7568 | /* | ||
7569 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 7557 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
7570 | * For now this just excludes isolated cpus, but could be used to | 7558 | * For now this just excludes isolated cpus, but could be used to |
7571 | * exclude other special cases in the future. | 7559 | * exclude other special cases in the future. |
@@ -7643,7 +7631,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
7643 | * ownership of it and will kfree it when done with it. If the caller | 7631 | * ownership of it and will kfree it when done with it. If the caller |
7644 | * failed the kmalloc call, then it can pass in doms_new == NULL, | 7632 | * failed the kmalloc call, then it can pass in doms_new == NULL, |
7645 | * and partition_sched_domains() will fallback to the single partition | 7633 | * and partition_sched_domains() will fallback to the single partition |
7646 | * 'fallback_doms'. | 7634 | * 'fallback_doms', it also forces the domains to be rebuilt. |
7647 | * | 7635 | * |
7648 | * Call with hotplug lock held | 7636 | * Call with hotplug lock held |
7649 | */ | 7637 | */ |
@@ -7657,12 +7645,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, | |||
7657 | /* always unregister in case we don't destroy any domains */ | 7645 | /* always unregister in case we don't destroy any domains */ |
7658 | unregister_sched_domain_sysctl(); | 7646 | unregister_sched_domain_sysctl(); |
7659 | 7647 | ||
7660 | if (doms_new == NULL) { | 7648 | if (doms_new == NULL) |
7661 | ndoms_new = 1; | 7649 | ndoms_new = 0; |
7662 | doms_new = &fallback_doms; | ||
7663 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | ||
7664 | dattr_new = NULL; | ||
7665 | } | ||
7666 | 7650 | ||
7667 | /* Destroy deleted domains */ | 7651 | /* Destroy deleted domains */ |
7668 | for (i = 0; i < ndoms_cur; i++) { | 7652 | for (i = 0; i < ndoms_cur; i++) { |
@@ -7677,6 +7661,14 @@ match1: | |||
7677 | ; | 7661 | ; |
7678 | } | 7662 | } |
7679 | 7663 | ||
7664 | if (doms_new == NULL) { | ||
7665 | ndoms_cur = 0; | ||
7666 | ndoms_new = 1; | ||
7667 | doms_new = &fallback_doms; | ||
7668 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | ||
7669 | dattr_new = NULL; | ||
7670 | } | ||
7671 | |||
7680 | /* Build new domains */ | 7672 | /* Build new domains */ |
7681 | for (i = 0; i < ndoms_new; i++) { | 7673 | for (i = 0; i < ndoms_new; i++) { |
7682 | for (j = 0; j < ndoms_cur; j++) { | 7674 | for (j = 0; j < ndoms_cur; j++) { |
@@ -7707,17 +7699,10 @@ match2: | |||
7707 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 7699 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
7708 | int arch_reinit_sched_domains(void) | 7700 | int arch_reinit_sched_domains(void) |
7709 | { | 7701 | { |
7710 | int err; | ||
7711 | |||
7712 | get_online_cpus(); | 7702 | get_online_cpus(); |
7713 | mutex_lock(&sched_domains_mutex); | 7703 | rebuild_sched_domains(); |
7714 | detach_destroy_domains(&cpu_online_map); | ||
7715 | free_sched_domains(); | ||
7716 | err = arch_init_sched_domains(&cpu_online_map); | ||
7717 | mutex_unlock(&sched_domains_mutex); | ||
7718 | put_online_cpus(); | 7704 | put_online_cpus(); |
7719 | 7705 | return 0; | |
7720 | return err; | ||
7721 | } | 7706 | } |
7722 | 7707 | ||
7723 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | 7708 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) |
@@ -7783,59 +7768,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
7783 | } | 7768 | } |
7784 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 7769 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
7785 | 7770 | ||
7771 | #ifndef CONFIG_CPUSETS | ||
7786 | /* | 7772 | /* |
7787 | * Force a reinitialization of the sched domains hierarchy. The domains | 7773 | * Add online and remove offline CPUs from the scheduler domains. |
7788 | * and groups cannot be updated in place without racing with the balancing | 7774 | * When cpusets are enabled they take over this function. |
7789 | * code, so we temporarily attach all running cpus to the NULL domain | ||
7790 | * which will prevent rebalancing while the sched domains are recalculated. | ||
7791 | */ | 7775 | */ |
7792 | static int update_sched_domains(struct notifier_block *nfb, | 7776 | static int update_sched_domains(struct notifier_block *nfb, |
7793 | unsigned long action, void *hcpu) | 7777 | unsigned long action, void *hcpu) |
7794 | { | 7778 | { |
7779 | switch (action) { | ||
7780 | case CPU_ONLINE: | ||
7781 | case CPU_ONLINE_FROZEN: | ||
7782 | case CPU_DEAD: | ||
7783 | case CPU_DEAD_FROZEN: | ||
7784 | partition_sched_domains(0, NULL, NULL); | ||
7785 | return NOTIFY_OK; | ||
7786 | |||
7787 | default: | ||
7788 | return NOTIFY_DONE; | ||
7789 | } | ||
7790 | } | ||
7791 | #endif | ||
7792 | |||
7793 | static int update_runtime(struct notifier_block *nfb, | ||
7794 | unsigned long action, void *hcpu) | ||
7795 | { | ||
7795 | int cpu = (int)(long)hcpu; | 7796 | int cpu = (int)(long)hcpu; |
7796 | 7797 | ||
7797 | switch (action) { | 7798 | switch (action) { |
7798 | case CPU_DOWN_PREPARE: | 7799 | case CPU_DOWN_PREPARE: |
7799 | case CPU_DOWN_PREPARE_FROZEN: | 7800 | case CPU_DOWN_PREPARE_FROZEN: |
7800 | disable_runtime(cpu_rq(cpu)); | 7801 | disable_runtime(cpu_rq(cpu)); |
7801 | /* fall-through */ | ||
7802 | case CPU_UP_PREPARE: | ||
7803 | case CPU_UP_PREPARE_FROZEN: | ||
7804 | detach_destroy_domains(&cpu_online_map); | ||
7805 | free_sched_domains(); | ||
7806 | return NOTIFY_OK; | 7802 | return NOTIFY_OK; |
7807 | 7803 | ||
7808 | |||
7809 | case CPU_DOWN_FAILED: | 7804 | case CPU_DOWN_FAILED: |
7810 | case CPU_DOWN_FAILED_FROZEN: | 7805 | case CPU_DOWN_FAILED_FROZEN: |
7811 | case CPU_ONLINE: | 7806 | case CPU_ONLINE: |
7812 | case CPU_ONLINE_FROZEN: | 7807 | case CPU_ONLINE_FROZEN: |
7813 | enable_runtime(cpu_rq(cpu)); | 7808 | enable_runtime(cpu_rq(cpu)); |
7814 | /* fall-through */ | 7809 | return NOTIFY_OK; |
7815 | case CPU_UP_CANCELED: | 7810 | |
7816 | case CPU_UP_CANCELED_FROZEN: | ||
7817 | case CPU_DEAD: | ||
7818 | case CPU_DEAD_FROZEN: | ||
7819 | /* | ||
7820 | * Fall through and re-initialise the domains. | ||
7821 | */ | ||
7822 | break; | ||
7823 | default: | 7811 | default: |
7824 | return NOTIFY_DONE; | 7812 | return NOTIFY_DONE; |
7825 | } | 7813 | } |
7826 | |||
7827 | #ifndef CONFIG_CPUSETS | ||
7828 | /* | ||
7829 | * Create default domain partitioning if cpusets are disabled. | ||
7830 | * Otherwise we let cpusets rebuild the domains based on the | ||
7831 | * current setup. | ||
7832 | */ | ||
7833 | |||
7834 | /* The hotplug lock is already held by cpu_up/cpu_down */ | ||
7835 | arch_init_sched_domains(&cpu_online_map); | ||
7836 | #endif | ||
7837 | |||
7838 | return NOTIFY_OK; | ||
7839 | } | 7814 | } |
7840 | 7815 | ||
7841 | void __init sched_init_smp(void) | 7816 | void __init sched_init_smp(void) |
@@ -7855,8 +7830,15 @@ void __init sched_init_smp(void) | |||
7855 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7830 | cpu_set(smp_processor_id(), non_isolated_cpus); |
7856 | mutex_unlock(&sched_domains_mutex); | 7831 | mutex_unlock(&sched_domains_mutex); |
7857 | put_online_cpus(); | 7832 | put_online_cpus(); |
7833 | |||
7834 | #ifndef CONFIG_CPUSETS | ||
7858 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7835 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
7859 | hotcpu_notifier(update_sched_domains, 0); | 7836 | hotcpu_notifier(update_sched_domains, 0); |
7837 | #endif | ||
7838 | |||
7839 | /* RT runtime code needs to handle some hotplug events */ | ||
7840 | hotcpu_notifier(update_runtime, 0); | ||
7841 | |||
7860 | init_hrtick(); | 7842 | init_hrtick(); |
7861 | 7843 | ||
7862 | /* Move init over to a non-isolated CPU */ | 7844 | /* Move init over to a non-isolated CPU */ |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f2aa987027d..d924c679dfa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -1004,6 +1004,8 @@ static void yield_task_fair(struct rq *rq) | |||
1004 | * not idle and an idle cpu is available. The span of cpus to | 1004 | * not idle and an idle cpu is available. The span of cpus to |
1005 | * search starts with cpus closest then further out as needed, | 1005 | * search starts with cpus closest then further out as needed, |
1006 | * so we always favor a closer, idle cpu. | 1006 | * so we always favor a closer, idle cpu. |
1007 | * Domains may include CPUs that are not usable for migration, | ||
1008 | * hence we need to mask them out (cpu_active_map) | ||
1007 | * | 1009 | * |
1008 | * Returns the CPU we should wake onto. | 1010 | * Returns the CPU we should wake onto. |
1009 | */ | 1011 | */ |
@@ -1031,6 +1033,7 @@ static int wake_idle(int cpu, struct task_struct *p) | |||
1031 | || ((sd->flags & SD_WAKE_IDLE_FAR) | 1033 | || ((sd->flags & SD_WAKE_IDLE_FAR) |
1032 | && !task_hot(p, task_rq(p)->clock, sd))) { | 1034 | && !task_hot(p, task_rq(p)->clock, sd))) { |
1033 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1035 | cpus_and(tmp, sd->span, p->cpus_allowed); |
1036 | cpus_and(tmp, tmp, cpu_active_map); | ||
1034 | for_each_cpu_mask(i, tmp) { | 1037 | for_each_cpu_mask(i, tmp) { |
1035 | if (idle_cpu(i)) { | 1038 | if (idle_cpu(i)) { |
1036 | if (i != task_cpu(p)) { | 1039 | if (i != task_cpu(p)) { |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d3d1cccb3d7..50735bb9614 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -934,6 +934,13 @@ static int find_lowest_rq(struct task_struct *task) | |||
934 | return -1; /* No targets found */ | 934 | return -1; /* No targets found */ |
935 | 935 | ||
936 | /* | 936 | /* |
937 | * Only consider CPUs that are usable for migration. | ||
938 | * I guess we might want to change cpupri_find() to ignore those | ||
939 | * in the first place. | ||
940 | */ | ||
941 | cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); | ||
942 | |||
943 | /* | ||
937 | * At this point we have built a mask of cpus representing the | 944 | * At this point we have built a mask of cpus representing the |
938 | * lowest priority tasks in the system. Now we want to elect | 945 | * lowest priority tasks in the system. Now we want to elect |
939 | * the best one based on our affinity and topology. | 946 | * the best one based on our affinity and topology. |