Diffstat (limited to 'kernel/sched.c')
 -rw-r--r--  kernel/sched.c | 146
 1 file changed, 123 insertions(+), 23 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 8ee437a5ec1d..fcc3483e9955 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -467,11 +467,17 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+		int next; /* next highest */
+#endif
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
 	int overloaded;
+	struct plist_head pushable_tasks;
 #endif
 	int rt_throttled;
 	u64 rt_time;
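
The rt_rq change above replaces the single highest_prio field with a small struct that caches both the highest (curr) and, on SMP, the next-highest (next) queued RT priority, and adds a plist of pushable tasks, so push/pull decisions can consult cached values instead of rescanning the queue. A rough standalone sketch of that two-level priority cache (plain C for illustration; prio_cache and recompute() are invented names, not kernel API):

    /* Standalone illustration, not kernel code: cache the best and
     * second-best priorities of a queue so hot paths need not rescan it. */
    #include <stdio.h>

    #define NR_PRIO 100                     /* 0 is the best priority */

    struct prio_cache {
        int curr;                           /* best queued priority */
        int next;                           /* second-best queued priority */
    };

    static void recompute(const int nr_queued[NR_PRIO], struct prio_cache *c)
    {
        c->curr = c->next = NR_PRIO;        /* NR_PRIO means "nothing queued" */
        for (int p = 0; p < NR_PRIO; p++) {
            if (!nr_queued[p])
                continue;
            if (p < c->curr) {
                c->next = c->curr;
                c->curr = p;
            } else if (p < c->next) {
                c->next = p;
            }
        }
        /* two tasks at the best priority also make it the next-best */
        if (c->curr < NR_PRIO && nr_queued[c->curr] > 1)
            c->next = c->curr;
    }

    int main(void)
    {
        int nr_queued[NR_PRIO] = { 0 };
        struct prio_cache c;

        nr_queued[10] = 1;
        nr_queued[40] = 2;
        recompute(nr_queued, &c);
        printf("curr=%d next=%d\n", c.curr, c.next);    /* curr=10 next=40 */
        return 0;
    }

The kernel code maintains these fields incrementally on enqueue/dequeue rather than rescanning; the scan above only illustrates the invariant being cached.
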
@@ -1610,21 +1616,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_PREEMPT
+
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
  */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	spin_unlock(&this_rq->lock);
+	double_rq_lock(this_rq, busiest);
+
+	return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
 	int ret = 0;
 
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
@@ -1637,6 +1664,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+
+	return _double_lock_balance(this_rq, busiest);
+}
+
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(busiest->lock)
 {
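
Under CONFIG_PREEMPT the patch always drops this_rq->lock and takes both locks via double_rq_lock() (fair, and always reports that the lock was dropped); otherwise it keeps the old trylock scheme that favours the lower-addressed runqueue. A rough userspace analogue of the unfair path, assuming POSIX mutexes (double_lock below is an illustrative helper, not the kernel function):

    /* Userspace sketch (POSIX mutexes, not the kernel API) of the "unfair"
     * path: trylock the second lock, and on failure retake both in a fixed
     * address order so two threads doing this concurrently cannot deadlock. */
    #include <pthread.h>

    /* Returns 1 if 'held' had to be dropped, so the caller knows any state
     * it derived under that lock may be stale. */
    static int double_lock(pthread_mutex_t *held, pthread_mutex_t *busiest)
    {
        int dropped = 0;

        if (pthread_mutex_trylock(busiest) != 0) {
            if (busiest < held) {
                /* wrong order: back off, then take both lowest-address first */
                pthread_mutex_unlock(held);
                pthread_mutex_lock(busiest);
                pthread_mutex_lock(held);
                dropped = 1;
            } else {
                /* already in canonical order: a blocking lock is safe */
                pthread_mutex_lock(busiest);
            }
        }
        return dropped;
    }

    int main(void)
    {
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        pthread_mutex_lock(&a);
        int dropped = double_lock(&a, &b);  /* both locks held afterwards */
        pthread_mutex_unlock(&b);
        pthread_mutex_unlock(&a);
        return dropped;
    }

The return value matters to callers in both variants: when it is 1, anything observed under this_rq->lock before the call may have changed while that lock was released.
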
@@ -1705,6 +1748,9 @@ static void update_avg(u64 *avg, u64 sample)
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
+	if (wakeup)
+		p->se.start_runtime = p->se.sum_exec_runtime;
+
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup);
 	p->se.on_rq = 1;
@@ -1712,10 +1758,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-	if (sleep && p->se.last_wakeup) {
-		update_avg(&p->se.avg_overlap,
-			   p->se.sum_exec_runtime - p->se.last_wakeup);
-		p->se.last_wakeup = 0;
+	if (sleep) {
+		if (p->se.last_wakeup) {
+			update_avg(&p->se.avg_overlap,
+				   p->se.sum_exec_runtime - p->se.last_wakeup);
+			p->se.last_wakeup = 0;
+		} else {
+			update_avg(&p->se.avg_wakeup,
+				   sysctl_sched_wakeup_granularity);
+		}
 	}
 
 	sched_info_dequeued(p);
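
Both avg_overlap and the new avg_wakeup are folded together by update_avg(), a simple running average; when a task goes to sleep without having issued a wakeup since it last woke, the hunk above charges a full sysctl_sched_wakeup_granularity so the estimate drifts toward "not a fast waker". A minimal sketch of that kind of estimator, assuming the 1/8 sample weight update_avg() uses in this version of sched.c:

    #include <stdio.h>
    #include <stdint.h>

    /* Running average with 1/8 weight for new samples, in the spirit of
     * sched.c's update_avg(): avg += (sample - avg) / 8. */
    static void update_avg(uint64_t *avg, uint64_t sample)
    {
        int64_t diff = (int64_t)(sample - *avg);

        *avg += diff / 8;
    }

    int main(void)
    {
        uint64_t avg_wakeup = 0;

        update_avg(&avg_wakeup, 4000000);   /* 4 ms sample -> avg = 500000 */
        update_avg(&avg_wakeup, 2000000);   /* avg = 500000 + 1500000/8 = 687500 */
        update_avg(&avg_wakeup, 1000000);   /* avg = 687500 + 312500/8 = 726562 */
        printf("avg_wakeup = %llu ns\n", (unsigned long long)avg_wakeup);
        return 0;
    }
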
@@ -2355,6 +2406,22 @@ out_activate:
 	activate_task(rq, p, 1);
 	success = 1;
 
+	/*
+	 * Only attribute actual wakeups done by this task.
+	 */
+	if (!in_interrupt()) {
+		struct sched_entity *se = &current->se;
+		u64 sample = se->sum_exec_runtime;
+
+		if (se->last_wakeup)
+			sample -= se->last_wakeup;
+		else
+			sample -= se->start_runtime;
+		update_avg(&se->avg_wakeup, sample);
+
+		se->last_wakeup = se->sum_exec_runtime;
+	}
+
 out_running:
 	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
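
The block added to try_to_wake_up() samples how much CPU time the waker itself consumed since the previous wakeup it issued, or since it was enqueued for its first wakeup, and feeds that into its avg_wakeup. A standalone illustration of just that sample selection (struct and function names are invented for the example; each sample would be folded in via update_avg() as sketched after the dequeue_task hunk):

    #include <stdio.h>
    #include <stdint.h>

    struct waker_stats {
        uint64_t sum_exec_runtime;  /* total CPU time this task has run */
        uint64_t start_runtime;     /* sum_exec_runtime when it was enqueued */
        uint64_t last_wakeup;       /* sum_exec_runtime at its previous wakeup, 0 if none */
    };

    /* Pick the runtime window this wakeup is attributed to. */
    static uint64_t wakeup_sample(struct waker_stats *w)
    {
        uint64_t sample = w->sum_exec_runtime;

        if (w->last_wakeup)
            sample -= w->last_wakeup;       /* time since its previous wakeup */
        else
            sample -= w->start_runtime;     /* first wakeup since being enqueued */

        w->last_wakeup = w->sum_exec_runtime;
        return sample;
    }

    int main(void)
    {
        struct waker_stats w = { .start_runtime = 1000 };

        w.sum_exec_runtime = 5000;
        printf("sample = %llu\n", (unsigned long long)wakeup_sample(&w)); /* 4000 */
        w.sum_exec_runtime = 7000;
        printf("sample = %llu\n", (unsigned long long)wakeup_sample(&w)); /* 2000 */
        return 0;
    }
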
@@ -2365,8 +2432,6 @@ out_running:
 	p->sched_class->task_wake_up(rq, p);
 #endif
 out:
-	current->se.last_wakeup = current->se.sum_exec_runtime;
-
 	task_rq_unlock(rq, &flags);
 
 	return success;
@@ -2396,6 +2461,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.last_wakeup = 0;
 	p->se.avg_overlap = 0;
+	p->se.start_runtime = 0;
+	p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -2458,6 +2525,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
 	put_cpu();
 }
 
@@ -2598,6 +2667,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+#ifdef CONFIG_SMP
+	int post_schedule = 0;
+
+	if (current->sched_class->needs_post_schedule)
+		post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
 
 	rq->prev_mm = NULL;
 
@@ -2616,7 +2691,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-	if (current->sched_class->post_schedule)
+	if (post_schedule)
 		current->sched_class->post_schedule(rq);
 #endif
 
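
needs_post_schedule() is now evaluated at the top of finish_task_switch(), before finish_lock_switch() releases the runqueue lock, and the later post_schedule() call acts on that latched value rather than on state that may have changed in between. A generic pthreads sketch of this "decide while locked, act after unlocking" pattern (names are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool have_pushable;          /* protected by rq_lock */

    static void push_tasks(void)        /* heavyweight work, done without rq_lock */
    {
    }

    static void finish_switch(void)
    {
        bool post_schedule;

        pthread_mutex_lock(&rq_lock);
        post_schedule = have_pushable;  /* latch the decision while it is stable */
        pthread_mutex_unlock(&rq_lock);

        if (post_schedule)              /* act on the latched value, not live state */
            push_tasks();
    }

    int main(void)
    {
        pthread_mutex_lock(&rq_lock);
        have_pushable = true;
        pthread_mutex_unlock(&rq_lock);

        finish_switch();
        return 0;
    }
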
@@ -2997,6 +3072,16 @@ next:
 	pulled++;
 	rem_load_move -= p->se.load.weight;
 
+#ifdef CONFIG_PREEMPT
+	/*
+	 * NEWIDLE balancing is a source of latency, so preemptible kernels
+	 * will stop after the first task is pulled to minimize the critical
+	 * section.
+	 */
+	if (idle == CPU_NEWLY_IDLE)
+		goto out;
+#endif
+
 	/*
 	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
@@ -3043,9 +3128,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 
+#ifdef CONFIG_PREEMPT
+		/*
+		 * NEWIDLE balancing is a source of latency, so preemptible
+		 * kernels will stop after the first task is pulled to minimize
+		 * the critical section.
+		 */
 		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
 			break;
-
+#endif
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
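
Both balancing hunks above bound NEWIDLE balancing on CONFIG_PREEMPT kernels: once a single task has been pulled, the pull stops so the locked critical section stays short. A standalone sketch of that early-exit shape (the enum and the preemptible flag stand in for CPU_NEWLY_IDLE and CONFIG_PREEMPT; this is not kernel code):

    enum idle_type { CPU_NOT_IDLE, CPU_NEWLY_IDLE };

    struct queue {
        int nr;
        int items[32];
    };

    /* Pull up to max_items from src to dst, but stop after the first item
     * when a preemptible, newly idle caller only needs something to run. */
    static int pull_items(struct queue *dst, struct queue *src,
                          int max_items, enum idle_type idle, int preemptible)
    {
        int pulled = 0;

        while (src->nr > 0 && pulled < max_items) {
            dst->items[dst->nr++] = src->items[--src->nr];
            pulled++;

            if (preemptible && idle == CPU_NEWLY_IDLE)
                break;              /* keep the locked section short */
        }
        return pulled;
    }

    int main(void)
    {
        struct queue src = { .nr = 3, .items = { 1, 2, 3 } };
        struct queue dst = { .nr = 0 };

        return pull_items(&dst, &src, 8, CPU_NEWLY_IDLE, 1);    /* returns 1 */
    }
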
@@ -3890,19 +3981,24 @@ int select_nohz_load_balancer(int stop_tick)
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
 		cpu_rq(cpu)->in_nohz_recently = 1;
 
-		/*
-		 * If we are going offline and still the leader, give up!
-		 */
-		if (!cpu_active(cpu) &&
-		    atomic_read(&nohz.load_balancer) == cpu) {
+		if (!cpu_active(cpu)) {
+			if (atomic_read(&nohz.load_balancer) != cpu)
+				return 0;
+
+			/*
+			 * If we are going offline and still the leader,
+			 * give up!
+			 */
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
+
 			return 0;
 		}
 
+		cpumask_set_cpu(cpu, nohz.cpu_mask);
+
 		/* time for ilb owner also to sleep */
 		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
 			if (atomic_read(&nohz.load_balancer) == cpu)
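
The reordering above lets the !cpu_active() case bail out after a cheap atomic_read() when this CPU is not the nohz leader, and only then performs the cmpxchg, which must therefore succeed (hence the BUG() if it does not). A C11-atomics sketch of that resignation step (userspace analogue, not the kernel atomic API):

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int load_balancer = -1;       /* CPU id of the owner, -1 = none */

    /* Give up the load-balancer role, but only if we still hold it. */
    static bool resign_if_leader(int cpu)
    {
        if (atomic_load(&load_balancer) != cpu)
            return false;                       /* cheap read: not the leader */

        int expected = cpu;
        /* succeeds only if nobody changed the owner since the read above */
        return atomic_compare_exchange_strong(&load_balancer, &expected, -1);
    }

    int main(void)
    {
        atomic_store(&load_balancer, 3);
        return resign_if_leader(3) ? 0 : 1;     /* CPU 3 resigns: exit 0 */
    }
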
@@ -8214,11 +8310,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
+	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
 
 	rt_rq->rt_time = 0;