author     Steven Rostedt <srostedt@redhat.com>   2008-01-25 15:08:05 -0500
committer  Ingo Molnar <mingo@elte.hu>            2008-01-25 15:08:05 -0500
commit     e8fa136262e1121288bb93befe2295928ffd240d
tree       5df829adde9b43efee39275c05751c99bf46eb2f
parent     764a9d6fe4b52995c8aba277e3634385699354f4
sched: add RT task pushing
This patch adds an algorithm to push extra RT tasks off a run queue to other CPU runqueues.

When more than one RT task is added to a run queue, this algorithm takes an assertive approach to push the RT tasks that are not running onto other run queues that have lower priority. The way this works is that the highest RT task that is not running is looked at and we examine the runqueues of the CPUs in that task's affinity mask. We find the runqueue with the lowest prio in the CPU affinity of the picked task, and if it is lower in prio than the picked task, we push the task onto that CPU runqueue.

We continue pushing RT tasks off the current runqueue until we don't push any more. The algorithm stops when the next highest RT task can't preempt any other processes on other CPUs.

TODO: The algorithm may stop when there are still RT tasks that can be migrated. Specifically, if the CPU affinity of the highest non-running RT task is restricted to CPUs that are running higher priority tasks, there may be a lower priority task queued that has an affinity with a CPU that is running a lower priority task to which it could be migrated. This patch set does not address this issue.

Note: checkpatch reveals two over-80-character lines. I'm not sure that breaking them up will help visually, so I left them as is.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
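As an aside for readers following the description above, here is a minimal user-space sketch of the push-target selection step. It is illustration only, not kernel code: the names (fake_rq, pick_push_target, the NR_CPUS value, the bitmask affinity) are invented for this sketch; the only conventions carried over are that lower prio values mean higher priority and that MAX_RT_PRIO stands in for "no RT task queued".

/*
 * Minimal user-space sketch of the push-target selection described above.
 * All names here (fake_rq, pick_push_target, ...) are invented for this
 * illustration; they are not kernel types or APIs.
 * Convention: lower prio value == higher priority.
 */
#include <stdio.h>

#define NR_CPUS     4
#define MAX_RT_PRIO 100        /* "no RT task queued" sentinel */

struct fake_rq {
        int highest_prio;      /* prio of the highest RT task queued here */
};

/*
 * Return the CPU whose runqueue has the lowest priority (largest prio
 * value) among the CPUs set in @affinity, but only if that priority is
 * lower than @task_prio; otherwise return -1 (no suitable target).
 */
static int pick_push_target(const struct fake_rq *rqs, unsigned int affinity,
                            int task_prio, int this_cpu)
{
        int best_cpu = -1, best_prio = task_prio;

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu == this_cpu || !(affinity & (1u << cpu)))
                        continue;
                /* A CPU running no RT task at all is always good enough. */
                if (rqs[cpu].highest_prio >= MAX_RT_PRIO)
                        return cpu;
                if (rqs[cpu].highest_prio > best_prio) {
                        best_prio = rqs[cpu].highest_prio;
                        best_cpu = cpu;
                }
        }
        return best_cpu;
}

int main(void)
{
        struct fake_rq rqs[NR_CPUS] = {
                { .highest_prio = 10 }, /* this CPU, running a prio-10 task */
                { .highest_prio =  5 }, /* higher priority than our task    */
                { .highest_prio = 50 }, /* lower priority: a push candidate */
                { .highest_prio = 30 }, /* lower, but not the lowest        */
        };
        /* A prio-20 task queued on CPU 0, allowed to run on CPUs 1-3. */
        int target = pick_push_target(rqs, 0xe, 20, 0);

        printf("push target: CPU %d\n", target); /* prints "push target: CPU 2" */
        return 0;
}

A CPU with no RT load is taken immediately, otherwise the lowest-priority (largest prio value) runqueue in the affinity mask wins; the scan in find_lock_lowest_rq() in the patch below follows the same rule.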
-rw-r--r--   kernel/sched.c       8
-rw-r--r--   kernel/sched_rt.c  225
2 files changed, 231 insertions(+), 2 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6185fa080ec8..97cab609fc31 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1952,6 +1952,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
         prev_state = prev->state;
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
+        schedule_tail_balance_rt(rq);
+
         fire_sched_in_preempt_notifiers(current);
         if (mm)
                 mmdrop(mm);
@@ -2185,11 +2187,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
  */
-static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
         __releases(this_rq->lock)
         __acquires(busiest->lock)
         __acquires(this_rq->lock)
 {
+        int ret = 0;
+
         if (unlikely(!irqs_disabled())) {
                 /* printk() doesn't work good under rq->lock */
                 spin_unlock(&this_rq->lock);
@@ -2200,9 +2204,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
                         spin_unlock(&this_rq->lock);
                         spin_lock(&busiest->lock);
                         spin_lock(&this_rq->lock);
+                        ret = 1;
                 } else
                         spin_lock(&busiest->lock);
         }
+        return ret;
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 136c2857a049..7815e90b1147 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -137,6 +137,227 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 }
 
 #ifdef CONFIG_SMP
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+{
+        struct rt_prio_array *array = &rq->rt.active;
+        struct task_struct *next;
+        struct list_head *queue;
+        int idx;
+
+        assert_spin_locked(&rq->lock);
+
+        if (likely(rq->rt.rt_nr_running < 2))
+                return NULL;
+
+        idx = sched_find_first_bit(array->bitmap);
+        if (unlikely(idx >= MAX_RT_PRIO)) {
+                WARN_ON(1); /* rt_nr_running is bad */
+                return NULL;
+        }
+
+        queue = array->queue + idx;
+        next = list_entry(queue->next, struct task_struct, run_list);
+        if (unlikely(next != rq->curr))
+                return next;
+
+        if (queue->next->next != queue) {
+                /* same prio task */
+                next = list_entry(queue->next->next, struct task_struct, run_list);
+                return next;
+        }
+
+        /* slower, but more flexible */
+        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+        if (unlikely(idx >= MAX_RT_PRIO)) {
+                WARN_ON(1); /* rt_nr_running was 2 and above! */
+                return NULL;
+        }
+
+        queue = array->queue + idx;
+        next = list_entry(queue->next, struct task_struct, run_list);
+
+        return next;
+}
+
+static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task,
+                                      struct rq *this_rq)
+{
+        struct rq *lowest_rq = NULL;
+        int cpu;
+        int tries;
+        cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+
+        cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+
+        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+                /*
+                 * Scan each rq for the lowest prio.
+                 */
+                for_each_cpu_mask(cpu, *cpu_mask) {
+                        struct rq *rq = &per_cpu(runqueues, cpu);
+
+                        if (cpu == this_rq->cpu)
+                                continue;
+
+                        /* We look for lowest RT prio or non-rt CPU */
+                        if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+                                lowest_rq = rq;
+                                break;
+                        }
+
+                        /* no locking for now */
+                        if (rq->rt.highest_prio > task->prio &&
+                            (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
+                                lowest_rq = rq;
+                        }
+                }
+
+                if (!lowest_rq)
+                        break;
+
+                /* if the prio of this runqueue changed, try again */
+                if (double_lock_balance(this_rq, lowest_rq)) {
+                        /*
+                         * We had to unlock the run queue. In the
+                         * meantime, the task could have migrated already
+                         * or had its affinity changed. Also make sure
+                         * that it wasn't scheduled on its rq.
+                         */
+                        if (unlikely(task_rq(task) != this_rq ||
+                                     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
+                                     task_running(this_rq, task) ||
+                                     !task->se.on_rq)) {
+                                spin_unlock(&lowest_rq->lock);
+                                lowest_rq = NULL;
+                                break;
+                        }
+                }
+
+                /* If this rq is still suitable use it. */
+                if (lowest_rq->rt.highest_prio > task->prio)
+                        break;
+
+                /* try again */
+                spin_unlock(&lowest_rq->lock);
+                lowest_rq = NULL;
+        }
+
+        return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ */
+static int push_rt_task(struct rq *this_rq)
+{
+        struct task_struct *next_task;
+        struct rq *lowest_rq;
+        int ret = 0;
+        int paranoid = RT_MAX_TRIES;
+
+        assert_spin_locked(&this_rq->lock);
+
+        next_task = pick_next_highest_task_rt(this_rq);
+        if (!next_task)
+                return 0;
+
+ retry:
+        if (unlikely(next_task == this_rq->curr))
+                return 0;
+
+        /*
+         * It's possible that next_task slipped in at a higher
+         * priority than current. If that's the case, just
+         * reschedule current.
+         */
+        if (unlikely(next_task->prio < this_rq->curr->prio)) {
+                resched_task(this_rq->curr);
+                return 0;
+        }
+
+        /* We might release this_rq lock */
+        get_task_struct(next_task);
+
+        /* find_lock_lowest_rq locks the rq if found */
+        lowest_rq = find_lock_lowest_rq(next_task, this_rq);
+        if (!lowest_rq) {
+                struct task_struct *task;
+                /*
+                 * find_lock_lowest_rq releases this_rq->lock,
+                 * so it is possible that next_task has changed.
+                 * If it has, then try again.
+                 */
+                task = pick_next_highest_task_rt(this_rq);
+                if (unlikely(task != next_task) && task && paranoid--) {
+                        put_task_struct(next_task);
+                        next_task = task;
+                        goto retry;
+                }
+                goto out;
+        }
+
+        assert_spin_locked(&lowest_rq->lock);
+
+        deactivate_task(this_rq, next_task, 0);
+        set_task_cpu(next_task, lowest_rq->cpu);
+        activate_task(lowest_rq, next_task, 0);
+
+        resched_task(lowest_rq->curr);
+
+        spin_unlock(&lowest_rq->lock);
+
+        ret = 1;
+out:
+        put_task_struct(next_task);
+
+        return ret;
+}
+
+/*
+ * TODO: Currently we just use the second highest prio task on
+ *       the queue, and stop when it can't migrate (or there are
+ *       no more RT tasks). There may be a case where a lower
+ *       priority RT task has a different affinity than the
+ *       higher RT task. In this case the lower RT task could
+ *       possibly be able to migrate whereas the higher priority
+ *       RT task could not. We currently ignore this issue.
+ *       Enhancements are welcome!
+ */
+static void push_rt_tasks(struct rq *rq)
+{
+        /* push_rt_task() will return true if it moved an RT task */
+        while (push_rt_task(rq))
+                ;
+}
+
+static void schedule_tail_balance_rt(struct rq *rq)
+{
+        /*
+         * If we have more than one rt_task queued, then
+         * see if we can push the other rt_tasks off to other CPUs.
+         * Note we may release the rq lock, and since
+         * the lock was owned by prev, we need to release it
+         * first via finish_lock_switch and then reacquire it here.
+         */
+        if (unlikely(rq->rt.rt_nr_running > 1)) {
+                spin_lock_irq(&rq->lock);
+                push_rt_tasks(rq);
+                spin_unlock_irq(&rq->lock);
+        }
+}
+
 /*
  * Load-balancing iterator. Note: while the runqueue stays locked
  * during the whole iteration, the current task might be
@@ -241,7 +462,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
         return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                                   &rt_rq_iterator);
 }
-#endif
+#else /* CONFIG_SMP */
+# define schedule_tail_balance_rt(rq)   do { } while (0)
+#endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {