-rw-r--r--  kernel/sched.c    |   8
-rw-r--r--  kernel/sched_rt.c | 225
2 files changed, 231 insertions, 2 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 6185fa080ec8..97cab609fc31 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1952,6 +1952,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         prev_state = prev->state;
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
+        schedule_tail_balance_rt(rq);
+
         fire_sched_in_preempt_notifiers(current);
         if (mm)
                 mmdrop(mm);
@@ -2185,11 +2187,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
  */
-static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
 {
+        int ret = 0;
+
         if (unlikely(!irqs_disabled())) {
                 /* printk() doesn't work good under rq->lock */
                 spin_unlock(&this_rq->lock);
@@ -2200,9 +2204,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
                         spin_unlock(&this_rq->lock);
                         spin_lock(&busiest->lock);
                         spin_lock(&this_rq->lock);
+                        ret = 1;
                 } else
                         spin_lock(&busiest->lock);
         }
+        return ret;
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 136c2857a049..7815e90b1147 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -137,6 +137,227 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 }
 
 #ifdef CONFIG_SMP
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+{
+        struct rt_prio_array *array = &rq->rt.active;
+        struct task_struct *next;
+        struct list_head *queue;
+        int idx;
+
+        assert_spin_locked(&rq->lock);
+
+        if (likely(rq->rt.rt_nr_running < 2))
+                return NULL;
+
+        idx = sched_find_first_bit(array->bitmap);
+        if (unlikely(idx >= MAX_RT_PRIO)) {
+                WARN_ON(1); /* rt_nr_running is bad */
+                return NULL;
+        }
+
+        queue = array->queue + idx;
+        next = list_entry(queue->next, struct task_struct, run_list);
+        if (unlikely(next != rq->curr))
+                return next;
+
+        if (queue->next->next != queue) {
+                /* same prio task */
+                next = list_entry(queue->next->next, struct task_struct, run_list);
+                return next;
+        }
+
+        /* slower, but more flexible */
+        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+        if (unlikely(idx >= MAX_RT_PRIO)) {
+                WARN_ON(1); /* rt_nr_running was 2 and above! */
+                return NULL;
+        }
+
+        queue = array->queue + idx;
+        next = list_entry(queue->next, struct task_struct, run_list);
+
+        return next;
+}
+
+static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
+                                      struct rq *this_rq)
+{
+        struct rq *lowest_rq = NULL;
+        int cpu;
+        int tries;
+        cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+
+        cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+
+        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+                /*
+                 * Scan each rq for the lowest prio.
+                 */
+                for_each_cpu_mask(cpu, *cpu_mask) {
+                        struct rq *rq = &per_cpu(runqueues, cpu);
+
+                        if (cpu == this_rq->cpu)
+                                continue;
+
+                        /* We look for lowest RT prio or non-rt CPU */
+                        if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+                                lowest_rq = rq;
+                                break;
+                        }
+
+                        /* no locking for now */
+                        if (rq->rt.highest_prio > task->prio &&
+                            (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
+                                lowest_rq = rq;
+                        }
+                }
+
+                if (!lowest_rq)
+                        break;
+
+                /* if the prio of this runqueue changed, try again */
+                if (double_lock_balance(this_rq, lowest_rq)) {
+                        /*
+                         * We had to unlock the run queue. In
+                         * the mean time, task could have
+                         * migrated already or had its affinity changed.
+                         * Also make sure that it wasn't scheduled on its rq.
+                         */
+                        if (unlikely(task_rq(task) != this_rq ||
+                                     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
+                                     task_running(this_rq, task) ||
+                                     !task->se.on_rq)) {
+                                spin_unlock(&lowest_rq->lock);
+                                lowest_rq = NULL;
+                                break;
+                        }
+                }
+
+                /* If this rq is still suitable use it. */
+                if (lowest_rq->rt.highest_prio > task->prio)
+                        break;
+
+                /* try again */
+                spin_unlock(&lowest_rq->lock);
+                lowest_rq = NULL;
+        }
+
+        return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *this_rq)
+{
+        struct task_struct *next_task;
+        struct rq *lowest_rq;
+        int ret = 0;
+        int paranoid = RT_MAX_TRIES;
+
+        assert_spin_locked(&this_rq->lock);
+
+        next_task = pick_next_highest_task_rt(this_rq);
+        if (!next_task)
+                return 0;
+
+ retry:
+        if (unlikely(next_task == this_rq->curr))
+                return 0;
+
+        /*
+         * It's possible that next_task slipped in with a
+         * higher priority than current. If that's the case
+         * just reschedule current.
+         */
+        if (unlikely(next_task->prio < this_rq->curr->prio)) {
+                resched_task(this_rq->curr);
+                return 0;
+        }
+
+        /* We might release this_rq lock */
+        get_task_struct(next_task);
+
+        /* find_lock_lowest_rq locks the rq if found */
+        lowest_rq = find_lock_lowest_rq(next_task, this_rq);
+        if (!lowest_rq) {
+                struct task_struct *task;
+                /*
+                 * find_lock_lowest_rq releases this_rq->lock,
+                 * so it is possible that next_task has changed.
+                 * If it has, then try again.
+                 */
+                task = pick_next_highest_task_rt(this_rq);
+                if (unlikely(task != next_task) && task && paranoid--) {
+                        put_task_struct(next_task);
+                        next_task = task;
+                        goto retry;
+                }
+                goto out;
+        }
+
+        assert_spin_locked(&lowest_rq->lock);
+
+        deactivate_task(this_rq, next_task, 0);
+        set_task_cpu(next_task, lowest_rq->cpu);
+        activate_task(lowest_rq, next_task, 0);
+
+        resched_task(lowest_rq->curr);
+
+        spin_unlock(&lowest_rq->lock);
+
+        ret = 1;
+out:
+        put_task_struct(next_task);
+
+        return ret;
+}
+
+/*
+ * TODO: Currently we just use the second highest prio task on
+ * the queue, and stop when it can't migrate (or there's
+ * no more RT tasks). There may be a case where a lower
+ * priority RT task has a different affinity than the
+ * higher RT task. In this case the lower RT task could
+ * possibly be able to migrate whereas the higher priority
+ * RT task could not. We currently ignore this issue.
+ * Enhancements are welcome!
+ */
+static void push_rt_tasks(struct rq *rq)
+{
+        /* push_rt_task will return true if it moved an RT */
+        while (push_rt_task(rq))
+                ;
+}
+
+static void schedule_tail_balance_rt(struct rq *rq)
+{
+        /*
+         * If we have more than one rt_task queued, then
+         * see if we can push the other rt_tasks off to other CPUs.
+         * Note we may release the rq lock, and since
+         * the lock was owned by prev, we need to release it
+         * first via finish_lock_switch and then reacquire it here.
+         */
+        if (unlikely(rq->rt.rt_nr_running > 1)) {
+                spin_lock_irq(&rq->lock);
+                push_rt_tasks(rq);
+                spin_unlock_irq(&rq->lock);
+        }
+}
+
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -241,7 +462,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                                   &rt_rq_iterator);
 }
-#endif
+#else /* CONFIG_SMP */
+# define schedule_tail_balance_rt(rq) do { } while (0)
+#endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {