Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c | 200 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 169 insertions(+), 31 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 8e2558c2ba67..2f28351892c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  */
 static DEFINE_SPINLOCK(task_group_lock);
 
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+	return list_empty(&root_task_group.children);
+}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
@@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #else
 
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+	return 1;
+}
+#endif
+
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -467,11 +481,17 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+		int next; /* next highest */
+#endif
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
 	int overloaded;
+	struct plist_head pushable_tasks;
 #endif
 	int rt_throttled;
 	u64 rt_time;
@@ -549,7 +569,6 @@ struct rq {
 	unsigned long nr_running;
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-	unsigned char idle_at_tick;
 #ifdef CONFIG_NO_HZ
 	unsigned long last_tick_seen;
 	unsigned char in_nohz_recently;
@@ -590,6 +609,7 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;
 
+	unsigned char idle_at_tick;
 	/* For active balancing */
 	int active_balance;
 	int push_cpu;
@@ -1183,10 +1203,10 @@ static void resched_task(struct task_struct *p)
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+	set_tsk_need_resched(p);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1242,7 +1262,7 @@ void wake_up_idle_cpu(int cpu)
 	 * lockless. The worst case is that the other CPU runs the
 	 * idle task through an additional NOOP schedule()
 	 */
-	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+	set_tsk_need_resched(rq->idle);
 
 	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
@@ -1610,21 +1630,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_PREEMPT
+
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
  */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	spin_unlock(&this_rq->lock);
+	double_rq_lock(this_rq, busiest);
+
+	return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
 	int ret = 0;
 
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
@@ -1637,6 +1678,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+
+	return _double_lock_balance(this_rq, busiest);
+}
+
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(busiest->lock)
 {
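[Illustration, not part of the patch] Both variants above ultimately avoid deadlock the same way: the pair of runqueue locks is completed in one global order. The unfair path compares the runqueue pointers directly (and, since runqueues are per-cpu, lower addresses track lower cpu ids), which is what the comment about favoring lower cpu-ids refers to. A minimal user-space sketch of that ordering rule, using a hypothetical lock_pair_in_order() helper in place of the kernel's locking primitives:

#include <pthread.h>

/* Sketch only: always take the lower-addressed lock first, so two
 * threads contending for the same pair can never wait on each other. */
static void lock_pair_in_order(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(b);
	pthread_mutex_unlock(a);
}

The fair variant pays for this order unconditionally (drop this_rq->lock, then take both in order); the unfair variant first trylocks and only falls back to the ordered path on contention.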
@@ -1705,6 +1762,9 @@ static void update_avg(u64 *avg, u64 sample)
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
+	if (wakeup)
+		p->se.start_runtime = p->se.sum_exec_runtime;
+
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup);
 	p->se.on_rq = 1;
@@ -1712,10 +1772,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-	if (sleep && p->se.last_wakeup) {
-		update_avg(&p->se.avg_overlap,
-			p->se.sum_exec_runtime - p->se.last_wakeup);
-		p->se.last_wakeup = 0;
+	if (sleep) {
+		if (p->se.last_wakeup) {
+			update_avg(&p->se.avg_overlap,
+				p->se.sum_exec_runtime - p->se.last_wakeup);
+			p->se.last_wakeup = 0;
+		} else {
+			update_avg(&p->se.avg_wakeup,
+				sysctl_sched_wakeup_granularity);
+		}
 	}
 
 	sched_info_dequeued(p);
@@ -2267,7 +2332,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		sync = 0;
 
 #ifdef CONFIG_SMP
-	if (sched_feat(LB_WAKEUP_UPDATE)) {
+	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
 		struct sched_domain *sd;
 
 		this_cpu = raw_smp_processor_id();
@@ -2345,6 +2410,22 @@ out_activate:
 	activate_task(rq, p, 1);
 	success = 1;
 
+	/*
+	 * Only attribute actual wakeups done by this task.
+	 */
+	if (!in_interrupt()) {
+		struct sched_entity *se = &current->se;
+		u64 sample = se->sum_exec_runtime;
+
+		if (se->last_wakeup)
+			sample -= se->last_wakeup;
+		else
+			sample -= se->start_runtime;
+		update_avg(&se->avg_wakeup, sample);
+
+		se->last_wakeup = se->sum_exec_runtime;
+	}
+
 out_running:
 	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
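[Illustration, not part of the patch] The avg_wakeup samples taken above, and the avg_overlap/avg_wakeup updates in dequeue_task(), all go through update_avg(), which sched.c implements as a simple eighth-weight running average (an assumption worth checking against the tree this patch applies to). A self-contained user-space model of that accumulation; the 5 ms starting value is a placeholder for sysctl_sched_wakeup_granularity:

#include <stdio.h>
#include <stdint.h>

/* Assumed to match sched.c's update_avg(): move 1/8 of the way from the
 * old average toward the new sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff >> 3;
}

int main(void)
{
	/* avg_wakeup starts out at sysctl_sched_wakeup_granularity (see the
	 * __sched_fork() hunk below); 5 ms is an assumed placeholder. */
	uint64_t avg_wakeup = 5000000;
	uint64_t samples[] = { 1000000, 1200000, 800000 };	/* ns of waker runtime between wakeups */
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_wakeup, samples[i]);
		printf("avg_wakeup = %llu ns\n", (unsigned long long)avg_wakeup);
	}
	return 0;
}

A task that wakes others frequently thus pulls its avg_wakeup down toward the short runtime it accumulates between wakeups, while a task that goes to sleep without waking anyone is pushed back toward the granularity default in dequeue_task().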
@@ -2355,8 +2436,6 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
-	current->se.last_wakeup = current->se.sum_exec_runtime;
-
 	task_rq_unlock(rq, &flags);
 
 	return success;
@@ -2386,6 +2465,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.last_wakeup = 0;
 	p->se.avg_overlap = 0;
+	p->se.start_runtime = 0;
+	p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -2448,6 +2529,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
 	put_cpu();
 }
 
@@ -2588,6 +2671,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+#ifdef CONFIG_SMP
+	int post_schedule = 0;
+
+	if (current->sched_class->needs_post_schedule)
+		post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
 
 	rq->prev_mm = NULL;
 
@@ -2606,7 +2695,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-	if (current->sched_class->post_schedule)
+	if (post_schedule)
 		current->sched_class->post_schedule(rq);
 #endif
 
@@ -2987,6 +3076,16 @@ next:
 	pulled++;
 	rem_load_move -= p->se.load.weight;
 
+#ifdef CONFIG_PREEMPT
+	/*
+	 * NEWIDLE balancing is a source of latency, so preemptible kernels
+	 * will stop after the first task is pulled to minimize the critical
+	 * section.
+	 */
+	if (idle == CPU_NEWLY_IDLE)
+		goto out;
+#endif
+
 	/*
 	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
@@ -3033,9 +3132,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 
+#ifdef CONFIG_PREEMPT
+		/*
+		 * NEWIDLE balancing is a source of latency, so preemptible
+		 * kernels will stop after the first task is pulled to minimize
+		 * the critical section.
+		 */
 		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
 			break;
-
+#endif
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
@@ -4057,6 +4162,11 @@ static void run_rebalance_domains(struct softirq_action *h)
 #endif
 }
 
+static inline int on_null_domain(int cpu)
+{
+	return !rcu_dereference(cpu_rq(cpu)->sd);
+}
+
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  *
@@ -4114,7 +4224,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	    cpumask_test_cpu(cpu, nohz.cpu_mask))
 		return;
 #endif
-	if (time_after_eq(jiffies, rq->next_balance))
+	/* Don't need to rebalance while attached to NULL domain */
+	if (time_after_eq(jiffies, rq->next_balance) &&
+	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
 }
 
@@ -4508,11 +4620,33 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+	if (prev->state == TASK_RUNNING) {
+		u64 runtime = prev->se.sum_exec_runtime;
+
+		runtime -= prev->se.prev_sum_exec_runtime;
+		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+		/*
+		 * In order to avoid avg_overlap growing stale when we are
+		 * indeed overlapping and hence not getting put to sleep, grow
+		 * the avg_overlap on preemption.
+		 *
+		 * We use the average preemption runtime because that
+		 * correlates to the amount of cache footprint a task can
+		 * build up.
+		 */
+		update_avg(&prev->se.avg_overlap, runtime);
+	}
+	prev->sched_class->put_prev_task(rq, prev);
+}
+
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
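[Illustration, not part of the patch] A worked example of the clamp above, assuming the default sysctl_sched_migration_cost of 500000 ns and the same eighth-weight update_avg() as in the earlier sketch: a task preempted after running 10 ms contributes only min(10 ms, 2 * 0.5 ms) = 1 ms to avg_overlap, so a single long stretch on the CPU cannot swamp the running average.

#include <stdio.h>
#include <stdint.h>

#define MIGRATION_COST_NS	500000ULL	/* assumed default of sysctl_sched_migration_cost */

/* Same assumed eighth-weight running average as in the earlier sketch. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	*avg += (int64_t)(sample - *avg) >> 3;
}

int main(void)
{
	uint64_t avg_overlap = 0;
	uint64_t runtime = 10000000ULL;	/* ran 10 ms before being preempted */

	if (runtime > 2 * MIGRATION_COST_NS)
		runtime = 2 * MIGRATION_COST_NS;	/* clamped to 1 ms */

	update_avg(&avg_overlap, runtime);
	printf("avg_overlap after one preemption: %llu ns\n",
	       (unsigned long long)avg_overlap);	/* prints 125000 */
	return 0;
}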
@@ -4586,8 +4720,8 @@ need_resched_nonpreemptible:
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
-	prev->sched_class->put_prev_task(rq, prev);
-	next = pick_next_task(rq, prev);
+	put_prev_task(rq, prev);
+	next = pick_next_task(rq);
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
@@ -4642,7 +4776,7 @@ asmlinkage void __sched preempt_schedule(void)
 		 * between schedule and now.
 		 */
 		barrier();
-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+	} while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
 
@@ -4671,7 +4805,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 		 * between schedule and now.
 		 */
 		barrier();
-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+	} while (need_resched());
 }
 
 #endif /* CONFIG_PREEMPT */
@@ -5145,7 +5279,7 @@ SYSCALL_DEFINE1(nice, int, increment)
 	if (increment > 40)
 		increment = 40;
 
-	nice = PRIO_TO_NICE(current->static_prio) + increment;
+	nice = TASK_NICE(current) + increment;
 	if (nice < -20)
 		nice = -20;
 	if (nice > 19)
@@ -6423,7 +6557,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 		if (!rq->nr_running)
 			break;
 		update_rq_clock(rq);
-		next = pick_next_task(rq, rq->curr);
+		next = pick_next_task(rq);
 		if (!next)
 			break;
 		next->sched_class->put_prev_task(rq, next);
@@ -8218,11 +8352,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
+	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
 
 	rt_rq->rt_time = 0;
@@ -9598,7 +9736,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	struct cpuacct *ca;
 	int cpu;
 
-	if (!cpuacct_subsys.active)
+	if (unlikely(!cpuacct_subsys.active))
 		return;
 
 	cpu = task_cpu(tsk);