author	Steven Rostedt <srostedt@redhat.com>	2008-01-25 15:08:07 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-01-25 15:08:07 -0500
commit	f65eda4f789168ba5ff3fa75546c29efeed19f58 (patch)
tree	235e6daad2bc37b22cc5b21907608c79f944f036 /kernel
parent	4fd29176b7cd24909f8ceba2105cb3ae2857b90c (diff)
sched: pull RT tasks from overloaded runqueues
This patch adds the algorithm to pull tasks from RT overloaded runqueues. When a pull is initiated, all overloaded runqueues are examined for an RT task that is higher in priority than the highest-priority task queued on the target runqueue. If such a task is found, it is pulled to the target runqueue.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
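For orientation, the core of the pull decision can be sketched roughly as follows. This is an illustrative reduction, not the patch itself: locking, the double_lock_balance() retry path, the overload-clearing branch, and the check against the source runqueue's current task are omitted, and the function name pull_rt_task_sketch is hypothetical. The complete logic is in the kernel/sched_rt.c hunk below.

/* Illustrative sketch only -- simplified from pull_rt_task() in the patch below. */
static int pull_rt_task_sketch(struct rq *this_rq)
{
	struct task_struct *next = pick_next_task_rt(this_rq);
	struct task_struct *p;
	int cpu, ret = 0;

	/* walk every runqueue currently marked RT-overloaded */
	for_each_cpu_mask(cpu, *rt_overload()) {
		struct rq *src_rq = cpu_rq(cpu);

		if (cpu == this_rq->cpu || src_rq->rt.rt_nr_running <= 1)
			continue;

		/* highest-prio queued RT task on src_rq that may run on this CPU */
		p = pick_next_highest_task_rt(src_rq, this_rq->cpu);

		/* pull it only if it beats what this_rq would run next */
		if (p && (!next || p->prio < next->prio)) {
			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_rq->cpu);
			activate_task(this_rq, p, 0);
			next = p;	/* keep searching for an even higher prio task */
			ret = 1;
		}
	}
	return ret;
}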
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c	2
-rw-r--r--	kernel/sched_rt.c	187
2 files changed, 178 insertions(+), 11 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 97cab609fc31..c91797107913 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3721,6 +3721,8 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
+	schedule_balance_rt(rq, prev);
+
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 547f858b0752..bacb32039e95 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -179,8 +179,17 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+	if (!task_running(rq, p) &&
+	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
+		return 1;
+	return 0;
+}
+
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
+						     int cpu)
 {
 	struct rt_prio_array *array = &rq->rt.active;
 	struct task_struct *next;
@@ -199,26 +208,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
 	}
 
 	queue = array->queue + idx;
+	BUG_ON(list_empty(queue));
+
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next != rq->curr))
-		return next;
+	if (unlikely(pick_rt_task(rq, next, cpu)))
+		goto out;
 
 	if (queue->next->next != queue) {
 		/* same prio task */
 		next = list_entry(queue->next->next, struct task_struct, run_list);
-		return next;
+		if (pick_rt_task(rq, next, cpu))
+			goto out;
 	}
 
+ retry:
 	/* slower, but more flexible */
 	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-	if (unlikely(idx >= MAX_RT_PRIO)) {
-		WARN_ON(1); /* rt_nr_running was 2 and above! */
+	if (unlikely(idx >= MAX_RT_PRIO))
 		return NULL;
-	}
 
 	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
+	BUG_ON(list_empty(queue));
+
+	list_for_each_entry(next, queue, run_list) {
+		if (pick_rt_task(rq, next, cpu))
+			goto out;
+	}
+
+	goto retry;
 
+ out:
 	return next;
 }
 
@@ -305,13 +324,15 @@ static int push_rt_task(struct rq *this_rq)
 
 	assert_spin_locked(&this_rq->lock);
 
-	next_task = pick_next_highest_task_rt(this_rq);
+	next_task = pick_next_highest_task_rt(this_rq, -1);
 	if (!next_task)
 		return 0;
 
  retry:
-	if (unlikely(next_task == this_rq->curr))
+	if (unlikely(next_task == this_rq->curr)) {
+		WARN_ON(1);
 		return 0;
+	}
 
 	/*
 	 * It's possible that the next_task slipped in of
@@ -335,7 +356,7 @@ static int push_rt_task(struct rq *this_rq)
 	 * so it is possible that next_task has changed.
 	 * If it has, then try again.
 	 */
-	task = pick_next_highest_task_rt(this_rq);
+	task = pick_next_highest_task_rt(this_rq, -1);
 	if (unlikely(task != next_task) && task && paranoid--) {
 		put_task_struct(next_task);
 		next_task = task;
@@ -378,6 +399,149 @@ static void push_rt_tasks(struct rq *rq)
 		;
 }
 
+static int pull_rt_task(struct rq *this_rq)
+{
+	struct task_struct *next;
+	struct task_struct *p;
+	struct rq *src_rq;
+	cpumask_t *rto_cpumask;
+	int this_cpu = this_rq->cpu;
+	int cpu;
+	int ret = 0;
+
+	assert_spin_locked(&this_rq->lock);
+
+	/*
+	 * If cpusets are used, and we have overlapping
+	 * run queue cpusets, then this algorithm may not catch all.
+	 * This is just the price you pay on trying to keep
+	 * dirtying caches down on large SMP machines.
+	 */
+	if (likely(!rt_overloaded()))
+		return 0;
+
+	next = pick_next_task_rt(this_rq);
+
+	rto_cpumask = rt_overload();
+
+	for_each_cpu_mask(cpu, *rto_cpumask) {
+		if (this_cpu == cpu)
+			continue;
+
+		src_rq = cpu_rq(cpu);
+		if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
+			/*
+			 * It is possible that overlapping cpusets
+			 * will miss clearing a non overloaded runqueue.
+			 * Clear it now.
+			 */
+			if (double_lock_balance(this_rq, src_rq)) {
+				/* unlocked our runqueue lock */
+				struct task_struct *old_next = next;
+				next = pick_next_task_rt(this_rq);
+				if (next != old_next)
+					ret = 1;
+			}
+			if (likely(src_rq->rt.rt_nr_running <= 1))
+				/*
+				 * Small chance that this_rq->curr changed
+				 * but it's really harmless here.
+				 */
+				rt_clear_overload(this_rq);
+			else
+				/*
+				 * Heh, the src_rq is now overloaded, since
+				 * we already have the src_rq lock, go straight
+				 * to pulling tasks from it.
+				 */
+				goto try_pulling;
+			spin_unlock(&src_rq->lock);
+			continue;
+		}
+
+		/*
+		 * We can potentially drop this_rq's lock in
+		 * double_lock_balance, and another CPU could
+		 * steal our next task - hence we must cause
+		 * the caller to recalculate the next task
+		 * in that case:
+		 */
+		if (double_lock_balance(this_rq, src_rq)) {
+			struct task_struct *old_next = next;
+			next = pick_next_task_rt(this_rq);
+			if (next != old_next)
+				ret = 1;
+		}
+
+		/*
+		 * Are there still pullable RT tasks?
+		 */
+		if (src_rq->rt.rt_nr_running <= 1) {
+			spin_unlock(&src_rq->lock);
+			continue;
+		}
+
+ try_pulling:
+		p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+		/*
+		 * Do we have an RT task that preempts
+		 * the to-be-scheduled task?
+		 */
+		if (p && (!next || (p->prio < next->prio))) {
+			WARN_ON(p == src_rq->curr);
+			WARN_ON(!p->se.on_rq);
+
+			/*
+			 * There's a chance that p is higher in priority
+			 * than what's currently running on its cpu.
+			 * This is just that p is wakeing up and hasn't
+			 * had a chance to schedule. We only pull
+			 * p if it is lower in priority than the
+			 * current task on the run queue or
+			 * this_rq next task is lower in prio than
+			 * the current task on that rq.
+			 */
+			if (p->prio < src_rq->curr->prio ||
+			    (next && next->prio < src_rq->curr->prio))
+				goto bail;
+
+			ret = 1;
+
+			deactivate_task(src_rq, p, 0);
+			set_task_cpu(p, this_cpu);
+			activate_task(this_rq, p, 0);
+			/*
+			 * We continue with the search, just in
+			 * case there's an even higher prio task
+			 * in another runqueue. (low likelyhood
+			 * but possible)
+			 */
+
+			/*
+			 * Update next so that we won't pick a task
+			 * on another cpu with a priority lower (or equal)
+			 * than the one we just picked.
+			 */
+			next = p;
+
+		}
+ bail:
+		spin_unlock(&src_rq->lock);
+	}
+
+	return ret;
+}
+
+static void schedule_balance_rt(struct rq *rq,
+				struct task_struct *prev)
+{
+	/* Try to pull RT tasks here if we lower this rq's prio */
+	if (unlikely(rt_task(prev)) &&
+	    rq->rt.highest_prio > prev->prio)
+		pull_rt_task(rq);
+}
+
 static void schedule_tail_balance_rt(struct rq *rq)
 {
 	/*
@@ -500,6 +664,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)	do { } while (0)
+# define schedule_balance_rt(rq, prev)	do { } while (0)
 #endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)