author		Steven Rostedt <srostedt@redhat.com>	2008-01-25 15:08:07 -0500
committer	Ingo Molnar <mingo@elte.hu>	2008-01-25 15:08:07 -0500
commit		f65eda4f789168ba5ff3fa75546c29efeed19f58 (patch)
tree		235e6daad2bc37b22cc5b21907608c79f944f036 /kernel
parent		4fd29176b7cd24909f8ceba2105cb3ae2857b90c (diff)
sched: pull RT tasks from overloaded runqueues
This patch adds the algorithm to pull tasks from RT overloaded runqueues.
When an RT pull is initiated, all overloaded runqueues are examined for
an RT task that is higher in prio than the highest prio task queued on the
target runqueue. If such a task is found on another runqueue, it is pulled
to the target runqueue.
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
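
For illustration only, here is a minimal user-space sketch of the pull decision described above. The types and helpers (toy_rq, highest_prio(), try_pull()) are invented for this sketch and are not the kernel's data structures or APIs; as in the kernel, a lower prio value means higher priority, and a runqueue counts as overloaded when more than one RT task is queued on it.

/*
 * Simplified model of the pull rule: scan an overloaded source runqueue
 * for a task whose priority beats everything on the target, and move it.
 * (Toy code, not kernel code.)
 */
#include <stdio.h>
#include <limits.h>

#define MAX_TASKS 8

struct toy_rq {
	int prio[MAX_TASKS];	/* priorities of queued RT tasks */
	int nr;			/* number of queued RT tasks */
};

/* Highest priority queued on rq (lowest value), or INT_MAX if empty. */
static int highest_prio(const struct toy_rq *rq)
{
	int best = INT_MAX;

	for (int i = 0; i < rq->nr; i++)
		if (rq->prio[i] < best)
			best = rq->prio[i];
	return best;
}

/* Pull one task from src to dst if it outranks dst's best; return 1 if pulled. */
static int try_pull(struct toy_rq *dst, struct toy_rq *src)
{
	int dst_best = highest_prio(dst);
	int best_idx = -1;

	if (src->nr <= 1)	/* source is not overloaded */
		return 0;

	for (int i = 0; i < src->nr; i++) {
		if (src->prio[i] < dst_best) {
			dst_best = src->prio[i];
			best_idx = i;
		}
	}

	if (best_idx < 0)	/* nothing on src beats the target's best */
		return 0;

	dst->prio[dst->nr++] = src->prio[best_idx];	/* "activate" on dst */
	src->prio[best_idx] = src->prio[--src->nr];	/* "deactivate" on src */
	return 1;
}

int main(void)
{
	struct toy_rq target = { .prio = { 50 }, .nr = 1 };
	struct toy_rq overloaded = { .prio = { 40, 10, 60 }, .nr = 3 };

	if (try_pull(&target, &overloaded))
		printf("pulled: target's best prio is now %d\n",
		       highest_prio(&target));	/* prints 10 */
	return 0;
}

Unlike the real pull_rt_task() added by this patch, the sketch ignores locking, CPU affinity and the currently running task; it only shows the priority comparison that decides whether a task moves to the target runqueue.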
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/sched.c		2
-rw-r--r--	kernel/sched_rt.c	187
2 files changed, 178 insertions, 11 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 97cab609fc31..c91797107913 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3721,6 +3721,8 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
+	schedule_balance_rt(rq, prev);
+
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 547f858b0752..bacb32039e95 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -179,8 +179,17 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+	if (!task_running(rq, p) &&
+	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
+		return 1;
+	return 0;
+}
+
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
+						     int cpu)
 {
 	struct rt_prio_array *array = &rq->rt.active;
 	struct task_struct *next;
@@ -199,26 +208,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
 	}
 
 	queue = array->queue + idx;
+	BUG_ON(list_empty(queue));
+
 	next = list_entry(queue->next, struct task_struct, run_list);
-	if (unlikely(next != rq->curr))
-		return next;
+	if (unlikely(pick_rt_task(rq, next, cpu)))
+		goto out;
 
 	if (queue->next->next != queue) {
 		/* same prio task */
 		next = list_entry(queue->next->next, struct task_struct, run_list);
-		return next;
+		if (pick_rt_task(rq, next, cpu))
+			goto out;
 	}
 
+ retry:
 	/* slower, but more flexible */
 	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-	if (unlikely(idx >= MAX_RT_PRIO)) {
-		WARN_ON(1); /* rt_nr_running was 2 and above! */
+	if (unlikely(idx >= MAX_RT_PRIO))
 		return NULL;
-	}
 
 	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
+	BUG_ON(list_empty(queue));
+
+	list_for_each_entry(next, queue, run_list) {
+		if (pick_rt_task(rq, next, cpu))
+			goto out;
+	}
+
+	goto retry;
 
+ out:
 	return next;
 }
 
@@ -305,13 +324,15 @@ static int push_rt_task(struct rq *this_rq)
 
 	assert_spin_locked(&this_rq->lock);
 
-	next_task = pick_next_highest_task_rt(this_rq);
+	next_task = pick_next_highest_task_rt(this_rq, -1);
 	if (!next_task)
 		return 0;
 
  retry:
-	if (unlikely(next_task == this_rq->curr))
+	if (unlikely(next_task == this_rq->curr)) {
+		WARN_ON(1);
 		return 0;
+	}
 
 	/*
 	 * It's possible that the next_task slipped in of
@@ -335,7 +356,7 @@ static int push_rt_task(struct rq *this_rq)
 	 * so it is possible that next_task has changed.
 	 * If it has, then try again.
 	 */
-	task = pick_next_highest_task_rt(this_rq);
+	task = pick_next_highest_task_rt(this_rq, -1);
 	if (unlikely(task != next_task) && task && paranoid--) {
 		put_task_struct(next_task);
 		next_task = task;
@@ -378,6 +399,149 @@ static void push_rt_tasks(struct rq *rq)
 		;
 }
 
+static int pull_rt_task(struct rq *this_rq)
+{
+	struct task_struct *next;
+	struct task_struct *p;
+	struct rq *src_rq;
+	cpumask_t *rto_cpumask;
+	int this_cpu = this_rq->cpu;
+	int cpu;
+	int ret = 0;
+
+	assert_spin_locked(&this_rq->lock);
+
+	/*
+	 * If cpusets are used, and we have overlapping
+	 * run queue cpusets, then this algorithm may not catch all.
+	 * This is just the price you pay on trying to keep
+	 * dirtying caches down on large SMP machines.
+	 */
+	if (likely(!rt_overloaded()))
+		return 0;
+
+	next = pick_next_task_rt(this_rq);
+
+	rto_cpumask = rt_overload();
+
+	for_each_cpu_mask(cpu, *rto_cpumask) {
+		if (this_cpu == cpu)
+			continue;
+
+		src_rq = cpu_rq(cpu);
+		if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
+			/*
+			 * It is possible that overlapping cpusets
+			 * will miss clearing a non overloaded runqueue.
+			 * Clear it now.
+			 */
+			if (double_lock_balance(this_rq, src_rq)) {
+				/* unlocked our runqueue lock */
+				struct task_struct *old_next = next;
+				next = pick_next_task_rt(this_rq);
+				if (next != old_next)
+					ret = 1;
+			}
+			if (likely(src_rq->rt.rt_nr_running <= 1))
+				/*
+				 * Small chance that this_rq->curr changed
+				 * but it's really harmless here.
+				 */
+				rt_clear_overload(this_rq);
+			else
+				/*
+				 * Heh, the src_rq is now overloaded, since
+				 * we already have the src_rq lock, go straight
+				 * to pulling tasks from it.
+				 */
+				goto try_pulling;
+			spin_unlock(&src_rq->lock);
+			continue;
+		}
+
+		/*
+		 * We can potentially drop this_rq's lock in
+		 * double_lock_balance, and another CPU could
+		 * steal our next task - hence we must cause
+		 * the caller to recalculate the next task
+		 * in that case:
+		 */
+		if (double_lock_balance(this_rq, src_rq)) {
+			struct task_struct *old_next = next;
+			next = pick_next_task_rt(this_rq);
+			if (next != old_next)
+				ret = 1;
+		}
+
+		/*
+		 * Are there still pullable RT tasks?
+		 */
+		if (src_rq->rt.rt_nr_running <= 1) {
+			spin_unlock(&src_rq->lock);
+			continue;
+		}
+
+ try_pulling:
+		p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+		/*
+		 * Do we have an RT task that preempts
+		 * the to-be-scheduled task?
+		 */
+		if (p && (!next || (p->prio < next->prio))) {
+			WARN_ON(p == src_rq->curr);
+			WARN_ON(!p->se.on_rq);
+
+			/*
+			 * There's a chance that p is higher in priority
+			 * than what's currently running on its cpu.
+			 * This is just that p is waking up and hasn't
+			 * had a chance to schedule. We only pull
+			 * p if it is lower in priority than the
+			 * current task on the run queue or
+			 * this_rq next task is lower in prio than
+			 * the current task on that rq.
+			 */
+			if (p->prio < src_rq->curr->prio ||
+			    (next && next->prio < src_rq->curr->prio))
+				goto bail;
+
+			ret = 1;
+
+			deactivate_task(src_rq, p, 0);
+			set_task_cpu(p, this_cpu);
+			activate_task(this_rq, p, 0);
+			/*
+			 * We continue with the search, just in
+			 * case there's an even higher prio task
+			 * in another runqueue. (low likelihood
+			 * but possible)
+			 */
+
+			/*
+			 * Update next so that we won't pick a task
+			 * on another cpu with a priority lower (or equal)
+			 * than the one we just picked.
+			 */
+			next = p;
+
+		}
+ bail:
+		spin_unlock(&src_rq->lock);
+	}
+
+	return ret;
+}
+
+static void schedule_balance_rt(struct rq *rq,
+				struct task_struct *prev)
+{
+	/* Try to pull RT tasks here if we lower this rq's prio */
+	if (unlikely(rt_task(prev)) &&
+	    rq->rt.highest_prio > prev->prio)
+		pull_rt_task(rq);
+}
+
 static void schedule_tail_balance_rt(struct rq *rq)
 {
 	/*
@@ -500,6 +664,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq) do { } while (0)
+# define schedule_balance_rt(rq, prev) do { } while (0)
 #endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)