author     Steven Rostedt <srostedt@redhat.com>   2008-01-25 15:08:05 -0500
committer  Ingo Molnar <mingo@elte.hu>            2008-01-25 15:08:05 -0500
commit     e8fa136262e1121288bb93befe2295928ffd240d
tree       5df829adde9b43efee39275c05751c99bf46eb2f
parent     764a9d6fe4b52995c8aba277e3634385699354f4
sched: add RT task pushing
This patch adds an algorithm to push extra RT tasks off a run queue to other CPU runqueues.

When more than one RT task is added to a run queue, this algorithm takes an assertive approach to push the RT tasks that are not running onto other run queues that have lower priority. The way this works is that the highest RT task that is not running is looked at and we examine the runqueues of the CPUs in that task's affinity mask. We find the runqueue with the lowest prio in the CPU affinity of the picked task, and if it is lower in prio than the picked task, we push the task onto that CPU runqueue.

We continue pushing RT tasks off the current runqueue until we don't push any more. The algorithm stops when the next highest RT task can't preempt any other processes on other CPUs.

TODO: The algorithm may stop when there are still RT tasks that can be migrated. Specifically, if the CPU affinity of the highest non-running RT task is restricted to CPUs that are running higher priority tasks, there may be a lower priority task queued that has an affinity with a CPU that is running a lower priority task to which it could be migrated. This patch set does not address this issue.

Note: checkpatch reveals two over-80-character lines. I'm not sure that breaking them up will help visually, so I left them as is.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
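As an aside for readers following the description above, here is a minimal user-space sketch of the push-target selection step. It is illustration only, not kernel code: the names (fake_rq, pick_push_target, the NR_CPUS value, the bitmask affinity) are invented for this sketch; the only conventions carried over are that lower prio values mean higher priority and that MAX_RT_PRIO stands in for "no RT task queued".

/*
 * Minimal user-space sketch of the push-target selection described above.
 * All names here (fake_rq, pick_push_target, ...) are invented for this
 * illustration; they are not kernel types or APIs.
 * Convention: lower prio value == higher priority.
 */
#include <stdio.h>

#define NR_CPUS     4
#define MAX_RT_PRIO 100        /* "no RT task queued" sentinel */

struct fake_rq {
        int highest_prio;      /* prio of the highest RT task queued here */
};

/*
 * Return the CPU whose runqueue has the lowest priority (largest prio
 * value) among the CPUs set in @affinity, but only if that priority is
 * lower than @task_prio; otherwise return -1 (no suitable target).
 */
static int pick_push_target(const struct fake_rq *rqs, unsigned int affinity,
                            int task_prio, int this_cpu)
{
        int best_cpu = -1, best_prio = task_prio;

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu == this_cpu || !(affinity & (1u << cpu)))
                        continue;
                /* A CPU running no RT task at all is always good enough. */
                if (rqs[cpu].highest_prio >= MAX_RT_PRIO)
                        return cpu;
                if (rqs[cpu].highest_prio > best_prio) {
                        best_prio = rqs[cpu].highest_prio;
                        best_cpu = cpu;
                }
        }
        return best_cpu;
}

int main(void)
{
        struct fake_rq rqs[NR_CPUS] = {
                { .highest_prio = 10 }, /* this CPU, running a prio-10 task */
                { .highest_prio =  5 }, /* higher priority than our task    */
                { .highest_prio = 50 }, /* lower priority: a push candidate */
                { .highest_prio = 30 }, /* lower, but not the lowest        */
        };
        /* A prio-20 task queued on CPU 0, allowed to run on CPUs 1-3. */
        int target = pick_push_target(rqs, 0xe, 20, 0);

        printf("push target: CPU %d\n", target); /* prints "push target: CPU 2" */
        return 0;
}

A CPU with no RT load is taken immediately, otherwise the lowest-priority (largest prio value) runqueue in the affinity mask wins; the scan in find_lock_lowest_rq() in the patch below follows the same rule.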
-rw-r--r--   kernel/sched.c       8
-rw-r--r--   kernel/sched_rt.c  225
2 files changed, 231 insertions(+), 2 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6185fa080ec8..97cab609fc31 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1952,6 +1952,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         prev_state = prev->state;
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
+        schedule_tail_balance_rt(rq);
+
         fire_sched_in_preempt_notifiers(current);
         if (mm)
                 mmdrop(mm);
@@ -2185,11 +2187,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
  */
-static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
 {
+        int ret = 0;
+
         if (unlikely(!irqs_disabled())) {
                 /* printk() doesn't work good under rq->lock */
                 spin_unlock(&this_rq->lock);
@@ -2200,9 +2204,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
                         spin_unlock(&this_rq->lock);
                         spin_lock(&busiest->lock);
                         spin_lock(&this_rq->lock);
+                        ret = 1;
                 } else
                         spin_lock(&busiest->lock);
         }
+        return ret;
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 136c2857a049..7815e90b1147 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -137,6 +137,227 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 }
 
 #ifdef CONFIG_SMP
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+{
+        struct rt_prio_array *array = &rq->rt.active;
+        struct task_struct *next;
+        struct list_head *queue;
+        int idx;
+
+        assert_spin_locked(&rq->lock);
+
+        if (likely(rq->rt.rt_nr_running < 2))
+                return NULL;
+
+        idx = sched_find_first_bit(array->bitmap);
+        if (unlikely(idx >= MAX_RT_PRIO)) {
+                WARN_ON(1); /* rt_nr_running is bad */
+                return NULL;
+        }
+
+        queue = array->queue + idx;
+        next = list_entry(queue->next, struct task_struct, run_list);
+        if (unlikely(next != rq->curr))
+                return next;
+
+        if (queue->next->next != queue) {
+                /* same prio task */
+                next = list_entry(queue->next->next, struct task_struct, run_list);
+                return next;
+        }
+
+        /* slower, but more flexible */
+        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+        if (unlikely(idx >= MAX_RT_PRIO)) {
+                WARN_ON(1); /* rt_nr_running was 2 and above! */
+                return NULL;
+        }
+
+        queue = array->queue + idx;
+        next = list_entry(queue->next, struct task_struct, run_list);
+
+        return next;
+}
+
+static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
+                                      struct rq *this_rq)
+{
+        struct rq *lowest_rq = NULL;
+        int cpu;
+        int tries;
+        cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+
+        cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+
+        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+                /*
+                 * Scan each rq for the lowest prio.
+                 */
+                for_each_cpu_mask(cpu, *cpu_mask) {
+                        struct rq *rq = &per_cpu(runqueues, cpu);
+
+                        if (cpu == this_rq->cpu)
+                                continue;
+
+                        /* We look for lowest RT prio or non-rt CPU */
+                        if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+                                lowest_rq = rq;
+                                break;
+                        }
+
+                        /* no locking for now */
+                        if (rq->rt.highest_prio > task->prio &&
+                            (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
+                                lowest_rq = rq;
+                        }
+                }
+
+                if (!lowest_rq)
+                        break;
+
+                /* if the prio of this runqueue changed, try again */
+                if (double_lock_balance(this_rq, lowest_rq)) {
+                        /*
+                         * We had to unlock the run queue. In the
+                         * meantime, the task could have migrated already
+                         * or had its affinity changed. Also make sure
+                         * that it wasn't scheduled on its rq.
+                         */
+                        if (unlikely(task_rq(task) != this_rq ||
+                                     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
+                                     task_running(this_rq, task) ||
+                                     !task->se.on_rq)) {
+                                spin_unlock(&lowest_rq->lock);
+                                lowest_rq = NULL;
+                                break;
+                        }
+                }
+
+                /* If this rq is still suitable use it. */
+                if (lowest_rq->rt.highest_prio > task->prio)
+                        break;
+
+                /* try again */
+                spin_unlock(&lowest_rq->lock);
+                lowest_rq = NULL;
+        }
+
+        return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *this_rq)
+{
+        struct task_struct *next_task;
+        struct rq *lowest_rq;
+        int ret = 0;
+        int paranoid = RT_MAX_TRIES;
+
+        assert_spin_locked(&this_rq->lock);
+
+        next_task = pick_next_highest_task_rt(this_rq);
+        if (!next_task)
+                return 0;
+
+ retry:
+        if (unlikely(next_task == this_rq->curr))
+                return 0;
+
+        /*
+         * It's possible that next_task slipped in at a higher
+         * priority than current. If that's the case, just
+         * reschedule current.
+         */
+        if (unlikely(next_task->prio < this_rq->curr->prio)) {
+                resched_task(this_rq->curr);
+                return 0;
+        }
+
+        /* We might release this_rq lock */
+        get_task_struct(next_task);
+
+        /* find_lock_lowest_rq locks the rq if found */
+        lowest_rq = find_lock_lowest_rq(next_task, this_rq);
+        if (!lowest_rq) {
+                struct task_struct *task;
+                /*
+                 * find_lock_lowest_rq releases this_rq->lock,
+                 * so it is possible that next_task has changed.
+                 * If it has, then try again.
+                 */
+                task = pick_next_highest_task_rt(this_rq);
+                if (unlikely(task != next_task) && task && paranoid--) {
+                        put_task_struct(next_task);
+                        next_task = task;
+                        goto retry;
+                }
+                goto out;
+        }
+
+        assert_spin_locked(&lowest_rq->lock);
+
+        deactivate_task(this_rq, next_task, 0);
+        set_task_cpu(next_task, lowest_rq->cpu);
+        activate_task(lowest_rq, next_task, 0);
+
+        resched_task(lowest_rq->curr);
+
+        spin_unlock(&lowest_rq->lock);
+
+        ret = 1;
+out:
+        put_task_struct(next_task);
+
+        return ret;
+}
+
+/*
+ * TODO: Currently we just use the second highest prio task on
+ *       the queue, and stop when it can't migrate (or there are
+ *       no more RT tasks). There may be a case where a lower
+ *       priority RT task has a different affinity than the
+ *       higher RT task. In this case the lower RT task could
+ *       possibly be able to migrate whereas the higher priority
+ *       RT task could not. We currently ignore this issue.
+ *       Enhancements are welcome!
+ */
+static void push_rt_tasks(struct rq *rq)
+{
+        /* push_rt_task() will return true if it moved an RT task */
+        while (push_rt_task(rq))
+                ;
+}
+
+static void schedule_tail_balance_rt(struct rq *rq)
+{
+        /*
+         * If we have more than one rt_task queued, then
+         * see if we can push the other rt_tasks off to other CPUs.
+         * Note we may release the rq lock, and since
+         * the lock was owned by prev, we need to release it
+         * first via finish_lock_switch and then reacquire it here.
+         */
+        if (unlikely(rq->rt.rt_nr_running > 1)) {
+                spin_lock_irq(&rq->lock);
+                push_rt_tasks(rq);
+                spin_unlock_irq(&rq->lock);
+        }
+}
+
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -241,7 +462,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                                   &rt_rq_iterator);
 }
-#endif
+#else /* CONFIG_SMP */
+# define schedule_tail_balance_rt(rq)   do { } while (0)
+#endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {