Diffstat (limited to 'kernel')
 -rw-r--r--  kernel/cpu.c          40
 -rw-r--r--  kernel/cpuset.c        2
 -rw-r--r--  kernel/sched.c       108
 -rw-r--r--  kernel/sched_fair.c    3
 -rw-r--r--  kernel/sched_rt.c     75
 5 files changed, 130 insertions(+), 98 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cfb1d43ab801..033603c1d7c3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
         cpu_hotplug.refcount = 0;
 }
 
+cpumask_t cpu_active_map;
+
 #ifdef CONFIG_HOTPLUG_CPU
 
 void get_online_cpus(void)
@@ -291,11 +293,30 @@ int __ref cpu_down(unsigned int cpu)
         int err = 0;
 
         cpu_maps_update_begin();
-        if (cpu_hotplug_disabled)
+
+        if (cpu_hotplug_disabled) {
                 err = -EBUSY;
-        else
-                err = _cpu_down(cpu, 0);
+                goto out;
+        }
+
+        cpu_clear(cpu, cpu_active_map);
+
+        /*
+         * Make sure all the cpus did the reschedule and are not
+         * using a stale version of the cpu_active_map.
+         * This is not strictly necessary because the stop_machine()
+         * we run down the line already provides the required
+         * synchronization. But it's really a side effect and we do not
+         * want to depend on the innards of stop_machine here.
+         */
+        synchronize_sched();
+
+        err = _cpu_down(cpu, 0);
 
+        if (cpu_online(cpu))
+                cpu_set(cpu, cpu_active_map);
+
+out:
         cpu_maps_update_done();
         return err;
 }
@@ -355,11 +376,18 @@ int __cpuinit cpu_up(unsigned int cpu)
         }
 
         cpu_maps_update_begin();
-        if (cpu_hotplug_disabled)
+
+        if (cpu_hotplug_disabled) {
                 err = -EBUSY;
-        else
-                err = _cpu_up(cpu, 0);
+                goto out;
+        }
+
+        err = _cpu_up(cpu, 0);
 
+        if (cpu_online(cpu))
+                cpu_set(cpu, cpu_active_map);
+
+out:
         cpu_maps_update_done();
         return err;
 }
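
Note: the cpu.c changes hinge on a handful of cpumask accessors and on ordering: clear the bit in cpu_active_map, let every CPU observe it (synchronize_sched()), and only then take the CPU down. Below is a minimal userspace sketch of that ordering, with a 64-bit integer standing in for the kernel's cpumask_t; the helper names mirror the kernel's, but the maps, values, and printouts are purely illustrative.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t cpumask;                  /* stand-in for cpumask_t */

static void cpu_set(int cpu, cpumask *m)   { *m |=  (UINT64_C(1) << cpu); }
static void cpu_clear(int cpu, cpumask *m) { *m &= ~(UINT64_C(1) << cpu); }
static int  cpu_isset(int cpu, cpumask m)  { return (int)((m >> cpu) & 1); }

int main(void)
{
        cpumask cpu_online_map = 0xf;      /* CPUs 0-3 online */
        cpumask cpu_active_map = 0xf;      /* and all of them active */

        /* cpu_down(2): first drop CPU 2 from the active map ... */
        cpu_clear(2, &cpu_active_map);
        /* ... synchronize_sched() would run here, so every CPU sees
         * the cleared bit before the teardown starts ... */
        cpu_clear(2, &cpu_online_map);     /* ... then take it offline */

        /* migration sites now test "active" rather than "online" */
        printf("cpu 2 active: %d\n", cpu_isset(2, cpu_active_map));
        printf("candidates:   %#llx\n",
               (unsigned long long)(cpu_online_map & cpu_active_map));

        /* CPU_DOWN_FAILED path: the CPU stayed up, mark it active again */
        cpu_set(2, &cpu_active_map);
        return 0;
}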
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..3c3ef02f65f1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -564,7 +564,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
         struct kfifo *q;        /* queue of cpusets to be scanned */
         struct cpuset *cp;      /* scans q */
diff --git a/kernel/sched.c b/kernel/sched.c
index c13c75e9f9f7..85cf246cfdf5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2802,7 +2802,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
 
         rq = task_rq_lock(p, &flags);
         if (!cpu_isset(dest_cpu, p->cpus_allowed)
-            || unlikely(cpu_is_offline(dest_cpu)))
+            || unlikely(!cpu_active(dest_cpu)))
                 goto out;
 
         /* force the process onto the specified CPU */
@@ -3770,7 +3770,7 @@ int select_nohz_load_balancer(int stop_tick)
                 /*
                  * If we are going offline and still the leader, give up!
                  */
-                if (cpu_is_offline(cpu) &&
+                if (!cpu_active(cpu) &&
                     atomic_read(&nohz.load_balancer) == cpu) {
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
                                 BUG();
@@ -5794,7 +5794,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
         struct rq *rq_dest, *rq_src;
         int ret = 0, on_rq;
 
-        if (unlikely(cpu_is_offline(dest_cpu)))
+        if (unlikely(!cpu_active(dest_cpu)))
                 return ret;
 
         rq_src = cpu_rq(src_cpu);
@@ -7472,18 +7472,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 }
 
 /*
- * Free current domain masks.
- * Called after all cpus are attached to NULL domain.
- */
-static void free_sched_domains(void)
-{
-        ndoms_cur = 0;
-        if (doms_cur != &fallback_doms)
-                kfree(doms_cur);
-        doms_cur = &fallback_doms;
-}
-
-/*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
@@ -7561,7 +7549,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
  * ownership of it and will kfree it when done with it. If the caller
  * failed the kmalloc call, then it can pass in doms_new == NULL,
  * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms'.
+ * 'fallback_doms'; this also forces the domains to be rebuilt.
  *
  * Call with hotplug lock held
  */
@@ -7575,12 +7563,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
         /* always unregister in case we don't destroy any domains */
         unregister_sched_domain_sysctl();
 
-        if (doms_new == NULL) {
-                ndoms_new = 1;
-                doms_new = &fallback_doms;
-                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-                dattr_new = NULL;
-        }
+        if (doms_new == NULL)
+                ndoms_new = 0;
 
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
@@ -7595,6 +7579,14 @@ match1:
                 ;
         }
 
+        if (doms_new == NULL) {
+                ndoms_cur = 0;
+                ndoms_new = 1;
+                doms_new = &fallback_doms;
+                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+                dattr_new = NULL;
+        }
+
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
                 for (j = 0; j < ndoms_cur; j++) {
@@ -7625,17 +7617,10 @@ match2:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 int arch_reinit_sched_domains(void)
 {
-        int err;
-
         get_online_cpus();
-        mutex_lock(&sched_domains_mutex);
-        detach_destroy_domains(&cpu_online_map);
-        free_sched_domains();
-        err = arch_init_sched_domains(&cpu_online_map);
-        mutex_unlock(&sched_domains_mutex);
+        rebuild_sched_domains();
         put_online_cpus();
-
-        return err;
+        return 0;
 }
 
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7701,59 +7686,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
+#ifndef CONFIG_CPUSETS
 /*
- * Force a reinitialization of the sched domains hierarchy. The domains
- * and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to the NULL domain
- * which will prevent rebalancing while the sched domains are recalculated.
+ * Add online and remove offline CPUs from the scheduler domains.
+ * When cpusets are enabled they take over this function.
  */
 static int update_sched_domains(struct notifier_block *nfb,
                                 unsigned long action, void *hcpu)
 {
+        switch (action) {
+        case CPU_ONLINE:
+        case CPU_ONLINE_FROZEN:
+        case CPU_DEAD:
+        case CPU_DEAD_FROZEN:
+                partition_sched_domains(0, NULL, NULL);
+                return NOTIFY_OK;
+
+        default:
+                return NOTIFY_DONE;
+        }
+}
+#endif
+
+static int update_runtime(struct notifier_block *nfb,
+                                unsigned long action, void *hcpu)
+{
         int cpu = (int)(long)hcpu;
 
         switch (action) {
         case CPU_DOWN_PREPARE:
         case CPU_DOWN_PREPARE_FROZEN:
                 disable_runtime(cpu_rq(cpu));
-                /* fall-through */
-        case CPU_UP_PREPARE:
-        case CPU_UP_PREPARE_FROZEN:
-                detach_destroy_domains(&cpu_online_map);
-                free_sched_domains();
                 return NOTIFY_OK;
 
-
         case CPU_DOWN_FAILED:
         case CPU_DOWN_FAILED_FROZEN:
         case CPU_ONLINE:
         case CPU_ONLINE_FROZEN:
                 enable_runtime(cpu_rq(cpu));
-                /* fall-through */
-        case CPU_UP_CANCELED:
-        case CPU_UP_CANCELED_FROZEN:
-        case CPU_DEAD:
-        case CPU_DEAD_FROZEN:
-                /*
-                 * Fall through and re-initialise the domains.
-                 */
-                break;
+                return NOTIFY_OK;
+
         default:
                 return NOTIFY_DONE;
         }
-
-#ifndef CONFIG_CPUSETS
-        /*
-         * Create default domain partitioning if cpusets are disabled.
-         * Otherwise we let cpusets rebuild the domains based on the
-         * current setup.
-         */
-
-        /* The hotplug lock is already held by cpu_up/cpu_down */
-        arch_init_sched_domains(&cpu_online_map);
-#endif
-
-        return NOTIFY_OK;
 }
 
 void __init sched_init_smp(void)
@@ -7773,8 +7748,15 @@ void __init sched_init_smp(void)
         cpu_set(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
+
+#ifndef CONFIG_CPUSETS
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
+#endif
+
+        /* RT runtime code needs to handle some hotplug events */
+        hotcpu_notifier(update_runtime, 0);
+
         init_hrtick();
 
         /* Move init over to a non-isolated CPU */
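
Note: the sched.c rework splits one overloaded hotplug notifier into two: update_sched_domains() now only repartitions domains once a CPU is fully ONLINE or DEAD (and only when cpusets are compiled out), while the new update_runtime() toggles RT runtime around DOWN_PREPARE / DOWN_FAILED / ONLINE. A compilable sketch of that dispatch shape follows; the enum values and the stubbed puts() calls are stand-ins for the kernel's CPU_* constants and the real disable_runtime()/enable_runtime()/partition_sched_domains() calls.

#include <stdio.h>

enum { CPU_ONLINE, CPU_DEAD, CPU_DOWN_PREPARE, CPU_DOWN_FAILED };
enum { NOTIFY_DONE, NOTIFY_OK };

/* domain notifier: rebuild only once a CPU is fully up or fully gone */
static int update_sched_domains(unsigned long action)
{
        switch (action) {
        case CPU_ONLINE:
        case CPU_DEAD:
                puts("partition_sched_domains(0, NULL, NULL)");  /* stub */
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}

/* runtime notifier: disable RT runtime before a CPU leaves,
 * re-enable it when the CPU comes (back) up */
static int update_runtime(unsigned long action)
{
        switch (action) {
        case CPU_DOWN_PREPARE:
                puts("disable_runtime(rq)");                     /* stub */
                return NOTIFY_OK;
        case CPU_DOWN_FAILED:
        case CPU_ONLINE:
                puts("enable_runtime(rq)");                      /* stub */
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}

int main(void)
{
        /* a failed offline: DOWN_PREPARE, then DOWN_FAILED */
        unsigned long events[] = { CPU_DOWN_PREPARE, CPU_DOWN_FAILED };
        for (unsigned i = 0; i < 2; i++) {
                update_sched_domains(events[i]);  /* NOTIFY_DONE for both */
                update_runtime(events[i]);        /* disable, then enable */
        }
        return 0;
}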
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6893b3ed65fe..7f700263f04c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1003,6 +1003,8 @@ static void yield_task_fair(struct rq *rq)
  * not idle and an idle cpu is available. The span of cpus to
  * search starts with cpus closest then further out as needed,
  * so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
  *
  * Returns the CPU we should wake onto.
  */
@@ -1030,6 +1032,7 @@ static int wake_idle(int cpu, struct task_struct *p)
                     || ((sd->flags & SD_WAKE_IDLE_FAR)
                         && !task_hot(p, task_rq(p)->clock, sd))) {
                         cpus_and(tmp, sd->span, p->cpus_allowed);
+                        cpus_and(tmp, tmp, cpu_active_map);
                         for_each_cpu_mask(i, tmp) {
                                 if (idle_cpu(i)) {
                                         if (i != task_cpu(p)) {
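
Note: on a fixed-width mask, the pattern the wake_idle() hunk adds (intersect the domain span with the task's affinity and with cpu_active_map, then walk the set bits as for_each_cpu_mask() does) reduces to a few lines. A sketch with a 64-bit mask and made-up input values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t span    = 0xff;  /* sd->span: CPUs 0-7 */
        uint64_t allowed = 0x3c;  /* p->cpus_allowed: CPUs 2-5 */
        uint64_t active  = 0xfb;  /* cpu_active_map: CPU 2 going down */

        /* cpus_and(tmp, sd->span, p->cpus_allowed);
         * cpus_and(tmp, tmp, cpu_active_map); */
        uint64_t tmp = span & allowed & active;

        /* for_each_cpu_mask(i, tmp) */
        while (tmp) {
                int i = __builtin_ctzll(tmp);     /* lowest set bit */
                tmp &= tmp - 1;                   /* clear it */
                printf("candidate cpu %d\n", i);  /* prints 3, 4, 5 */
        }
        return 0;
}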
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 147004c651c0..24621cea8bb0 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -601,11 +601,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
                 return;
 
-        if (rt_se->nr_cpus_allowed == 1)
-                list_add(&rt_se->run_list, queue);
-        else
-                list_add_tail(&rt_se->run_list, queue);
-
+        list_add_tail(&rt_se->run_list, queue);
         __set_bit(rt_se_prio(rt_se), array->bitmap);
 
         inc_rt_tasks(rt_se, rt_rq);
@@ -690,32 +686,34 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
  */
-static
-void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+static void
+requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
 {
-        struct rt_prio_array *array = &rt_rq->active;
-
         if (on_rt_rq(rt_se)) {
-                list_del_init(&rt_se->run_list);
-                list_add_tail(&rt_se->run_list,
-                              array->queue + rt_se_prio(rt_se));
+                struct rt_prio_array *array = &rt_rq->active;
+                struct list_head *queue = array->queue + rt_se_prio(rt_se);
+
+                if (head)
+                        list_move(&rt_se->run_list, queue);
+                else
+                        list_move_tail(&rt_se->run_list, queue);
         }
 }
 
-static void requeue_task_rt(struct rq *rq, struct task_struct *p)
+static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
 {
         struct sched_rt_entity *rt_se = &p->rt;
         struct rt_rq *rt_rq;
 
         for_each_sched_rt_entity(rt_se) {
                 rt_rq = rt_rq_of_se(rt_se);
-                requeue_rt_entity(rt_rq, rt_se);
+                requeue_rt_entity(rt_rq, rt_se, head);
         }
 }
 
 static void yield_task_rt(struct rq *rq)
 {
-        requeue_task_rt(rq, rq->curr);
+        requeue_task_rt(rq, rq->curr, 0);
 }
 
 #ifdef CONFIG_SMP
@@ -755,6 +753,30 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
          */
         return task_cpu(p);
 }
+
+static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+{
+        cpumask_t mask;
+
+        if (rq->curr->rt.nr_cpus_allowed == 1)
+                return;
+
+        if (p->rt.nr_cpus_allowed != 1
+            && cpupri_find(&rq->rd->cpupri, p, &mask))
+                return;
+
+        if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
+                return;
+
+        /*
+         * There appear to be other cpus that can accept
+         * current and none to run 'p', so let's reschedule
+         * to try and push current away:
+         */
+        requeue_task_rt(rq, p, 1);
+        resched_task(rq->curr);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -780,18 +802,8 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
          * to move current somewhere else, making room for our non-migratable
          * task.
          */
-        if((p->prio == rq->curr->prio)
-           && p->rt.nr_cpus_allowed == 1
-           && rq->curr->rt.nr_cpus_allowed != 1) {
-                cpumask_t mask;
-
-                if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
-                        /*
-                         * There appears to be other cpus that can accept
-                         * current, so lets reschedule to try and push it away
-                         */
-                        resched_task(rq->curr);
-        }
+        if (p->prio == rq->curr->prio && !need_resched())
+                check_preempt_equal_prio(rq, p);
 #endif
 }
 
@@ -924,6 +936,13 @@ static int find_lowest_rq(struct task_struct *task)
                 return -1; /* No targets found */
 
         /*
+         * Only consider CPUs that are usable for migration.
+         * I guess we might want to change cpupri_find() to ignore those
+         * in the first place.
+         */
+        cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
+
+        /*
          * At this point we have built a mask of cpus representing the
          * lowest priority tasks in the system. Now we want to elect
          * the best one based on our affinity and topology.
@@ -1417,7 +1436,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
          * on the queue:
          */
         if (p->rt.run_list.prev != p->rt.run_list.next) {
-                requeue_task_rt(rq, p);
+                requeue_task_rt(rq, p, 0);
                 set_tsk_need_resched(p);
         }
 }
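
Note: the new head argument to requeue_rt_entity() comes down to list_move() versus list_move_tail(): a head requeue puts the entity first in its priority queue (it runs next, which is what check_preempt_equal_prio() wants), a tail requeue puts it last (the yield path). The sketch below reimplements just the list helpers involved so it stays self-contained; the kernel's list.h versions behave the same way.

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void __list_add(struct list_head *n,
                       struct list_head *prev, struct list_head *next)
{
        next->prev = n; n->next = next; n->prev = prev; prev->next = n;
}
static void list_add(struct list_head *n, struct list_head *h)
{
        __list_add(n, h, h->next);             /* insert right after head */
}
static void list_add_tail(struct list_head *n, struct list_head *h)
{
        __list_add(n, h->prev, h);             /* insert right before head */
}
static void list_del(struct list_head *e)
{
        e->prev->next = e->next; e->next->prev = e->prev;
}
static void list_move(struct list_head *e, struct list_head *h)
{
        list_del(e); list_add(e, h);           /* requeue at the front */
}
static void list_move_tail(struct list_head *e, struct list_head *h)
{
        list_del(e); list_add_tail(e, h);      /* requeue at the back */
}

/* run_list is the first member, so a cast recovers the entity */
struct entity { struct list_head run_list; const char *name; };

static void requeue(struct entity *e, struct list_head *queue, int head)
{
        if (head)
                list_move(&e->run_list, queue);      /* run next */
        else
                list_move_tail(&e->run_list, queue); /* run last */
}

int main(void)
{
        struct list_head queue;
        struct entity a = { .name = "A" }, b = { .name = "B" };

        INIT_LIST_HEAD(&queue);
        list_add_tail(&a.run_list, &queue);
        list_add_tail(&b.run_list, &queue);       /* queue: A B */

        requeue(&b, &queue, 1);                   /* head:  B A */
        printf("front: %s\n", ((struct entity *)queue.next)->name);

        requeue(&b, &queue, 0);                   /* tail:  A B */
        printf("front: %s\n", ((struct entity *)queue.next)->name);
        return 0;
}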