path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/sched.c            274
-rw-r--r--   kernel/sched_debug.c        2
-rw-r--r--   kernel/sched_fair.c       324
-rw-r--r--   kernel/sched_idletask.c    26
-rw-r--r--   kernel/sched_rt.c          19
-rw-r--r--   kernel/sched_stoptask.c     7
-rw-r--r--   kernel/softirq.c            3
-rw-r--r--   kernel/sysctl.c             7
-rw-r--r--   kernel/time.c              23
9 files changed, 480 insertions, 205 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..2effcb71a478 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq {
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
 	unsigned int nr_spread_over;
 
@@ -1686,6 +1686,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1913,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
@@ -1920,8 +1953,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -2025,14 +2090,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2566,6 +2631,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
+	p->se.vruntime = 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -3568,6 +3634,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,33 +3670,90 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
+}
 
-	/* Account for system time used */
-	acct_update_integrals(p);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
 }
 
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif
+
 /*
  * Account for involuntary wait time.
  * @steal: the cpu time spent in involuntary wait
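The comment in the hunk above spells out the tick demultiplexing order, but it is easy to misread which branch wins when several conditions hold at once. The following is a small userspace model of that decision chain, not kernel code: the same if/else cascade with the kernel types and accounting calls replaced by an enum, so the ordering can be sanity-checked with a standalone compile.

#include <stdbool.h>
#include <stdio.h>

enum tick_target { HI, SI, KSOFTIRQD_SYS, USER, IDLE, GUEST, SYSTEM };

/* Mirrors the if/else cascade in irqtime_account_process_tick(). */
static enum tick_target classify_tick(bool hi_pending, bool si_pending,
				      bool is_ksoftirqd, bool user_tick,
				      bool is_idle_task, bool is_vcpu)
{
	if (hi_pending)		/* pending hardirq time wins first */
		return HI;
	if (si_pending)		/* then pending softirq time */
		return SI;
	if (is_ksoftirqd)	/* ksoftirqd: accounted as softirq plus stime */
		return KSOFTIRQD_SYS;
	if (user_tick)
		return USER;
	if (is_idle_task)
		return IDLE;
	if (is_vcpu)		/* PF_VCPU: guest time */
		return GUEST;
	return SYSTEM;
}

int main(void)
{
	/* A user tick with softirq time pending is still charged to softirq. */
	printf("%d\n", classify_tick(false, true, false, true, false, false) == SI);
	/* An idle CPU with nothing pending is charged to idle. */
	printf("%d\n", classify_tick(false, false, false, false, true, false) == IDLE);
	return 0;
}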
@@ -3645,6 +3794,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3824,12 @@ void account_steal_ticks(unsigned long ticks)
 */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
@@ -4570,11 +4730,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, &flags);
 }
 
@@ -4902,11 +5061,10 @@ recheck:
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		activate_task(rq, p, 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	__task_rq_unlock(rq);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -5323,6 +5481,58 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct task_struct *curr = current;
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	bool yielded = 0;
+
+	local_irq_save(flags);
+	rq = this_rq();
+
+again:
+	p_rq = task_rq(p);
+	double_rq_lock(rq, p_rq);
+	while (task_rq(p) != p_rq) {
+		double_rq_unlock(rq, p_rq);
+		goto again;
+	}
+
+	if (!curr->sched_class->yield_to_task)
+		goto out;
+
+	if (curr->sched_class != p->sched_class)
+		goto out;
+
+	if (task_running(p_rq, p) || p->state)
+		goto out;
+
+	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+	if (yielded)
+		schedstat_inc(rq, yld_count);
+
+out:
+	double_rq_unlock(rq, p_rq);
+	local_irq_restore(flags);
+
+	if (yielded)
+		schedule();
+
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
 /*
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
@@ -7796,6 +8006,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+	cfs_rq->load_stamp = 1;
+#endif
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
@@ -8109,6 +8323,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+	const struct sched_class *prev_class = p->sched_class;
+	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->se.on_rq;
@@ -8119,6 +8335,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		activate_task(rq, p, 0);
 		resched_task(rq->curr);
 	}
+
+	check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
@@ -8510,7 +8728,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se), 0);
+			update_cfs_shares(group_cfs_rq(se));
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
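yield_to() is exported GPL-only with callers like virtualization code in mind: when a vCPU spins on a lock whose holder has been preempted, the spinner can hand its timeslice to the presumed holder instead of burning it. The fragment below is only a hypothetical illustration of the calling convention, not part of this patch and not compilable on its own: "struct my_resource" and find_owner() are invented placeholders; only yield_to(), current and the RCU calls are real kernel APIs.

/*
 * Hypothetical caller sketch (placeholders: struct my_resource, find_owner):
 * boost whatever task we believe currently holds the contended resource.
 */
static void boost_presumed_owner(struct my_resource *res)
{
	struct task_struct *owner;

	rcu_read_lock();
	owner = find_owner(res);		/* assumed helper, not a real API */
	if (owner && owner != current)
		yield_to(owner, true);		/* API added by this patch */
	rcu_read_unlock();
}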
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 	raw_spin_lock_irqsave(&rq->lock, flags);
 	if (cfs_rq->rb_leftmost)
-		MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+		MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
 	last = __pick_last_entity(cfs_rq);
 	if (last)
 		max_vruntime = last->vruntime;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..d384e739ea95 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8;
 unsigned int sysctl_sched_child_runs_first __read_mostly;
 
 /*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
-/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
 
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *left = cfs_rq->rb_leftmost;
 
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 	return rb_entry(left, struct sched_entity, run_node);
 }
 
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+	struct rb_node *next = rb_next(&se->run_node);
+
+	if (!next)
+		return NULL;
+
+	return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
 static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 {
 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 * Scheduling class statistics methods:
 */
 
-#ifdef CONFIG_SCHED_DEBUG
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 }
 
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
 * Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	    now - cfs_rq->load_last > 4 * period) {
 		cfs_rq->load_period = 0;
 		cfs_rq->load_avg = 0;
+		delta = period - 1;
 	}
 
 	cfs_rq->load_stamp = now;
@@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	long load_weight, load, shares;
 
-	load = cfs_rq->load.weight + weight_delta;
+	load = cfs_rq->load.weight;
 
 	load_weight = atomic_read(&tg->load_weight);
-	load_weight -= cfs_rq->load_contribution;
 	load_weight += load;
+	load_weight -= cfs_rq->load_contribution;
 
 	shares = (tg->shares * load);
 	if (load_weight)
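With the weight_delta argument gone, calc_cfs_shares() works purely from the group's current per-cpu load and the group-wide load_weight. Below is a minimal userspace model of the arithmetic visible in this hunk; the clamping done in the unchanged tail of the real function is omitted, and all names are local to the example.

#include <stdio.h>

/*
 * Userspace model of the visible part of calc_cfs_shares():
 *   shares = tg->shares * local_load / effective group load.
 */
static long calc_shares(long tg_shares, long tg_load_weight,
			long cfs_contribution, long cfs_load)
{
	long load_weight = tg_load_weight + cfs_load - cfs_contribution;
	long shares = tg_shares * cfs_load;

	if (load_weight)
		shares /= load_weight;
	return shares;
}

int main(void)
{
	/* A group with 1024 shares and two CPUs carrying equal load
	 * gets half of its shares on this CPU: prints 512. */
	printf("%ld\n", calc_shares(1024, 2048, 1024, 1024));
	return 0;
}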
@@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
 {
 	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 }
 # else /* CONFIG_SMP */
@@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
-				long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 {
 	return tg->shares;
 }
@@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 	account_entity_enqueue(cfs_rq, se);
 }
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
@@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	if (likely(se->load.weight == tg->shares))
 		return;
 #endif
-	shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+	shares = calc_cfs_shares(cfs_rq, tg);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
@@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
 }
 
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 {
 }
 
@@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 */
 	update_curr(cfs_rq);
 	update_cfs_load(cfs_rq, 0);
-	update_cfs_shares(cfs_rq, se->load.weight);
 	account_entity_enqueue(cfs_rq, se);
+	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP) {
 		place_entity(cfs_rq, se, 0);
@@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 		list_add_leaf_cfs_rq(cfs_rq);
 }
 
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->last == se)
+			cfs_rq->last = NULL;
+		else
+			break;
+	}
+}
+
+static void __clear_buddies_next(struct sched_entity *se)
 {
-	if (!se || cfs_rq->last == se)
-		cfs_rq->last = NULL;
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->next == se)
+			cfs_rq->next = NULL;
+		else
+			break;
+	}
+}
 
-	if (!se || cfs_rq->next == se)
-		cfs_rq->next = NULL;
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		if (cfs_rq->skip == se)
+			cfs_rq->skip = NULL;
+		else
+			break;
+	}
 }
 
 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	for_each_sched_entity(se)
-		__clear_buddies(cfs_rq_of(se), se);
+	if (cfs_rq->last == se)
+		__clear_buddies_last(se);
+
+	if (cfs_rq->next == se)
+		__clear_buddies_next(se);
+
+	if (cfs_rq->skip == se)
+		__clear_buddies_skip(se);
 }
 
 static void
@@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	update_cfs_load(cfs_rq, 0);
 	account_entity_dequeue(cfs_rq, se);
 	update_min_vruntime(cfs_rq);
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	/*
 	 * Normalize the entity after updating the min_vruntime because the
@@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;
 
 	if (cfs_rq->nr_running > 1) {
-		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		struct sched_entity *se = __pick_first_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
 		if (delta < 0)
@@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *se = __pick_first_entity(cfs_rq);
 	struct sched_entity *left = se;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
-		se = cfs_rq->next;
+	/*
+	 * Avoid running the skip buddy, if running something else can
+	 * be done without getting too unfair.
+	 */
+	if (cfs_rq->skip == se) {
+		struct sched_entity *second = __pick_next_entity(se);
+		if (second && wakeup_preempt_entity(second, left) < 1)
+			se = second;
+	}
 
 	/*
 	 * Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
 		se = cfs_rq->last;
 
+	/*
+	 * Someone really wants this to run. If it's not unfair, run it.
+	 */
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
+
 	clear_buddies(cfs_rq, se);
 
 	return se;
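Because the skip check is applied to the leftmost entity first and the next buddy is tested last, the effective precedence is: next buddy, then last buddy, then skip avoidance, then the plain leftmost entity, with every buddy gated by wakeup_preempt_entity() so none of them can win if that would be too unfair. A compilable userspace model of that precedence, with the fairness test reduced to a vruntime-delta stub (all names are local to the example):

#include <stddef.h>
#include <stdio.h>

struct entity { const char *name; long long vruntime; };

/* Stand-in for wakeup_preempt_entity(): a candidate is acceptable if it
 * is not too far to the right of the leftmost entity. */
static int acceptable(const struct entity *cand, const struct entity *left)
{
	return cand->vruntime - left->vruntime < 1000000;	/* ~1ms */
}

/* Model of the post-patch pick_next_entity(): skip, then last, then next. */
static const struct entity *pick(const struct entity *left,
				 const struct entity *second,
				 const struct entity *last,
				 const struct entity *next,
				 const struct entity *skip)
{
	const struct entity *se = left;

	if (skip == se && second && acceptable(second, left))
		se = second;
	if (last && acceptable(last, left))
		se = last;
	if (next && acceptable(next, left))
		se = next;
	return se;
}

int main(void)
{
	struct entity a = { "leftmost", 100 }, b = { "second", 500 };
	struct entity waker = { "last", 800 }, wakee = { "next", 900 };

	/* Leftmost is marked skip (it just yielded) and a next buddy
	 * exists within the fairness bound: the next buddy wins. */
	printf("%s\n", pick(&a, &b, &waker, &wakee, &a)->name);
	return 0;
}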
@@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
@@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
+		update_cfs_shares(cfs_rq);
 	}
 
 	hrtick_update(rq);
 }
 
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
-	struct task_struct *curr = rq->curr;
-	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-	struct sched_entity *rightmost, *se = &curr->se;
-
-	/*
-	 * Are we the only task in the tree?
-	 */
-	if (unlikely(cfs_rq->nr_running == 1))
-		return;
-
-	clear_buddies(cfs_rq, se);
-
-	if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
-		update_rq_clock(rq);
-		/*
-		 * Update run-time statistics of the 'current'.
-		 */
-		update_curr(cfs_rq);
-
-		return;
-	}
-	/*
-	 * Find the rightmost entry in the rbtree:
-	 */
-	rightmost = __pick_last_entity(cfs_rq);
-	/*
-	 * Already in the rightmost position?
-	 */
-	if (unlikely(!rightmost || entity_before(rightmost, se)))
-		return;
-
-	/*
-	 * Minimally necessary key value to be last in the tree:
-	 * Upon rescheduling, sched_class::put_prev_task() will place
-	 * 'current' within the tree based on its new key value.
-	 */
-	se->vruntime = rightmost->vruntime + 1;
-}
-
 #ifdef CONFIG_SMP
 
 static void task_waking_fair(struct rq *rq, struct task_struct *p)
@@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se)
 	}
 }
 
+static void set_skip_buddy(struct sched_entity *se)
+{
+	if (likely(task_of(se)->policy != SCHED_IDLE)) {
+		for_each_sched_entity(se)
+			cfs_rq_of(se)->skip = se;
+	}
+}
+
 /*
 * Preempt the current task with a newly woken task if needed:
 */
@@ -1932,6 +1945,55 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 	}
 }
 
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	struct sched_entity *se = &curr->se;
+
+	/*
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(rq->nr_running == 1))
+		return;
+
+	clear_buddies(cfs_rq, se);
+
+	if (curr->policy != SCHED_BATCH) {
+		update_rq_clock(rq);
+		/*
+		 * Update run-time statistics of the 'current'.
+		 */
+		update_curr(cfs_rq);
+	}
+
+	set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+	struct sched_entity *se = &p->se;
+
+	if (!se->on_rq)
+		return false;
+
+	/* Tell the scheduler that we'd really like pse to run next. */
+	set_next_buddy(se);
+
+	/* Make p's CPU reschedule; pick_next_entity takes care of fairness. */
+	if (preempt)
+		resched_task(rq->curr);
+
+	yield_task_fair(rq);
+
+	return true;
+}
+
 #ifdef CONFIG_SMP
 /**************************************************
 * Fair scheduling class load-balancing methods:
@@ -2123,7 +2185,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
 	 * We need to update shares after updating tg->load_weight in
 	 * order to adjust the weight of groups with long running tasks.
 	 */
-	update_cfs_shares(cfs_rq, 0);
+	update_cfs_shares(cfs_rq);
 
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
@@ -2610,7 +2672,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
 * @local_group: Does group contain this_cpu.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
@@ -2618,7 +2679,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 */
 static inline void update_sg_lb_stats(struct sched_domain *sd,
 			struct sched_group *group, int this_cpu,
-			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			enum cpu_idle_type idle, int load_idx,
 			int local_group, const struct cpumask *cpus,
 			int *balance, struct sg_lb_stats *sgs)
 {
@@ -2638,9 +2699,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
 	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		if (*sd_idle && rq->nr_running)
-			*sd_idle = 0;
-
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
 			if (idle_cpu(i) && !first_idle_cpu) {
@@ -2755,15 +2813,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
 * @sd: sched_domain whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing sg.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
-			enum cpu_idle_type idle, int *sd_idle,
-			const struct cpumask *cpus, int *balance,
-			struct sd_lb_stats *sds)
+			enum cpu_idle_type idle, const struct cpumask *cpus,
+			int *balance, struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *sg = sd->groups;
@@ -2781,7 +2837,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 
 		local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
 		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
+		update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
 				local_group, cpus, balance, &sgs);
 
 		if (local_group && !(*balance))
@@ -3033,7 +3089,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 * @imbalance: Variable which stores amount of weighted load which should
 *		be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *	is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3101,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+		   const struct cpumask *cpus, int *balance)
 {
 	struct sd_lb_stats sds;
 
@@ -3056,8 +3111,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
-			balance, &sds);
+	update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
 
 	/* Cases where imbalance does not exist from POV of this_cpu */
 	/* 1) this_cpu is not the appropriate cpu to perform load balancing
@@ -3193,7 +3247,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
 /* Working cpumask for load_balance and load_balance_newidle. */
 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+static int need_active_balance(struct sched_domain *sd, int idle,
 			       int busiest_cpu, int this_cpu)
 {
 	if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3279,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
 		 * move_tasks() will succeed.  ld_moved will be true and this
 		 * active balance code will not be triggered.
 		 */
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-			return 0;
-
 		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
 			return 0;
 	}
@@ -3246,7 +3296,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
 			int *balance)
 {
-	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+	int ld_moved, all_pinned = 0, active_balance = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
@@ -3255,20 +3305,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpumask_copy(cpus, cpu_active_mask);
 
-	/*
-	 * When power savings policy is enabled for the parent domain, idle
-	 * sibling can pick up load irrespective of busy siblings. In this case,
-	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
-	 * portraying it as CPU_NOT_IDLE.
-	 */
-	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		sd_idle = 1;
-
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+	group = find_busiest_group(sd, this_cpu, &imbalance, idle,
 			cpus, balance);
 
 	if (*balance == 0)
@@ -3330,8 +3370,7 @@ redo:
 	if (idle != CPU_NEWLY_IDLE)
 		sd->nr_balance_failed++;
 
-	if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
-			this_cpu)) {
+	if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
 		raw_spin_lock_irqsave(&busiest->lock, flags);
 
 		/* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3425,6 @@ redo:
 		sd->balance_interval *= 2;
 	}
 
-	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
 	goto out;
 
 out_balanced:
@@ -3403,11 +3438,7 @@ out_one_pinned:
 	    (sd->balance_interval < sd->max_interval))
 		sd->balance_interval *= 2;
 
-	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
+	ld_moved = 0;
 out:
 	return ld_moved;
 }
@@ -4079,33 +4110,62 @@ static void task_fork_fair(struct task_struct *p)
 * Priority of the task has changed. Check to see if we preempt
 * the current task.
 */
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * Reschedule if we are currently running on this runqueue and
 	 * our priority decreased, or if we are not currently running on
 	 * this runqueue and our priority is higher than the current's
 	 */
-	if (running) {
+	if (rq->curr == p) {
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
 
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	/*
+	 * Ensure the task's vruntime is normalized, so that when its
+	 * switched back to the fair class the enqueue_entity(.flags=0) will
+	 * do the right thing.
+	 *
+	 * If it was on_rq, then the dequeue_entity(.flags=0) will already
+	 * have normalized the vruntime, if it was !on_rq, then only when
+	 * the task is sleeping will it still have non-normalized vruntime.
+	 */
+	if (!se->on_rq && p->state != TASK_RUNNING) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+}
+
 /*
 * We switched to the sched_fair class.
 */
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+	if (!p->se.on_rq)
+		return;
+
 	/*
 	 * We were most likely switched from sched_rt, so
 	 * kick off the schedule if running, otherwise just see
 	 * if we can still preempt the current task.
 	 */
-	if (running)
+	if (rq->curr == p)
 		resched_task(rq->curr);
 	else
 		check_preempt_curr(rq, p, 0);
@@ -4171,6 +4231,7 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task = enqueue_task_fair,
 	.dequeue_task = dequeue_task_fair,
 	.yield_task = yield_task_fair,
+	.yield_to_task = yield_to_task_fair,
 
 	.check_preempt_curr = check_preempt_wakeup,
 
@@ -4191,6 +4252,7 @@ static const struct sched_class fair_sched_class = {
 	.task_fork = task_fork_fair,
 
 	.prio_changed = prio_changed_fair,
+	.switched_from = switched_from_fair,
 	.switched_to = switched_to_fair,
 
 	.get_rr_interval = get_rr_interval_fair,
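The new switched_from_fair() exists so that a task leaving the fair class while asleep does not keep an absolute vruntime: it is rebased to be relative to its cfs_rq's min_vruntime, mirroring what dequeue_entity() already does for queued tasks. A toy userspace model of why that matters, assuming only the subtract-on-leave / add-on-rejoin convention described in the comment above (all names are local to the example):

#include <stdio.h>

typedef unsigned long long u64;

/* Leaving a cfs_rq: store vruntime relative to the queue's min_vruntime. */
static u64 normalize(u64 vruntime, u64 min_vruntime)
{
	return vruntime - min_vruntime;
}

/* Rejoining later (possibly a different cfs_rq): rebase onto its clock. */
static u64 renormalize(u64 rel_vruntime, u64 new_min_vruntime)
{
	return rel_vruntime + new_min_vruntime;
}

int main(void)
{
	u64 v = 5000000, min_then = 4990000, min_now = 9000000;
	u64 rel = normalize(v, min_then);

	/* Without rebasing, the stale absolute value (5000000) would sit far
	 * left of min_now and grant an effectively unlimited sleep bonus. */
	printf("relative=%llu, re-enqueued at %llu\n",
	       rel, renormalize(rel, min_now));
	return 0;
}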
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..c82f26c1b7c3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
 {
 }
 
-static void switched_to_idle(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
 {
-	/* Can this actually happen?? */
-	if (running)
-		resched_task(rq->curr);
-	else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
-static void prio_changed_idle(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	/* This can happen for hot plug CPUS */
-
-	/*
-	 * Reschedule if we are currently running on this runqueue and
-	 * our priority decreased, or if we are not currently running on
-	 * this runqueue and our priority is higher than the current's
-	 */
-	if (running) {
-		if (p->prio > oldprio)
-			resched_task(rq->curr);
-	} else
-		check_preempt_curr(rq, p, 0);
+	BUG();
 }
 
 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ad6267714c84..4e108f8ecb6a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1595,8 +1595,7 @@ static void rq_offline_rt(struct rq *rq)
 * When switch from the rt queue, we bring ourselves to a position
 * that we might want to pull RT tasks from other runqueues.
 */
-static void switched_from_rt(struct rq *rq, struct task_struct *p,
-			   int running)
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
 {
 	/*
 	 * If there are other RT tasks then we will reschedule
@@ -1605,7 +1604,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!rq->rt.rt_nr_running)
+	if (p->se.on_rq && !rq->rt.rt_nr_running)
 		pull_rt_task(rq);
 }
 
@@ -1624,8 +1623,7 @@ static inline void init_sched_rt_class(void)
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
-static void switched_to_rt(struct rq *rq, struct task_struct *p,
-			   int running)
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
 	int check_resched = 1;
 
@@ -1636,7 +1634,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
 	 * If that current running task is also an RT task
 	 * then see if we can move to another run queue.
 	 */
-	if (!running) {
+	if (p->se.on_rq && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->rt.overloaded && push_rt_task(rq) &&
 		    /* Don't resched if we changed runqueues */
@@ -1652,10 +1650,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
-static void prio_changed_rt(struct rq *rq, struct task_struct *p,
-			    int oldprio, int running)
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
 {
-	if (running) {
+	if (!p->se.on_rq)
+		return;
+
+	if (rq->curr == p) {
 #ifdef CONFIG_SMP
 		/*
 		 * If our priority decreases while running, we
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..84ec9bcf82d9 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq)
 {
 }
 
-static void switched_to_stop(struct rq *rq, struct task_struct *p,
-			     int running)
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
 {
 	BUG(); /* its impossible to change to this class */
 }
 
-static void prio_changed_stop(struct rq *rq, struct task_struct *p,
-			      int oldprio, int running)
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
 {
 	BUG(); /* how!?, what priority? */
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..0cee50487629 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat);
 
 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
 
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 char *softirq_to_name[NR_SOFTIRQS] = {
 	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
@@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu)
 {
 	set_current_state(TASK_INTERRUPTIBLE);
 
-	current->flags |= PF_KSOFTIRQD;
 	while (!kthread_should_stop()) {
 		preempt_disable();
 		if (!local_softirq_pending()) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83db985..cb7c830f7faa 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -361,13 +361,6 @@ static struct ctl_table kern_table[] = {
 		.mode = 0644,
 		.proc_handler = sched_rt_handler,
 	},
-	{
-		.procname = "sched_compat_yield",
-		.data = &sysctl_sched_compat_yield,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
 #ifdef CONFIG_SCHED_AUTOGROUP
 	{
 		.procname = "sched_autogroup_enabled",
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..55337a816b20 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
 }
 
 /**
- * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
 *
 * @n: nsecs in u64
 *
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
 * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 */
-unsigned long nsecs_to_jiffies(u64 n)
+u64 nsecs_to_jiffies64(u64 n)
 {
 #if (NSEC_PER_SEC % HZ) == 0
 	/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n)
 #endif
 }
 
+
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
+{
+	return (unsigned long)nsecs_to_jiffies64(n);
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
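For the common HZ values the conversion split out here reduces to a single integer division by NSEC_PER_SEC / HZ, and the truncation toward zero is deliberate: unlike the timeout helpers it never rounds up to MAX_JIFFY_OFFSET, since the scheduler is the intended user. A quick userspace check of that arithmetic, assuming HZ=1000 for the example:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ 1000	/* assumed for this example; (NSEC_PER_SEC % HZ) == 0 */

/* Same arithmetic as the (NSEC_PER_SEC % HZ) == 0 branch above. */
static uint64_t nsecs_to_jiffies64_model(uint64_t n)
{
	return n / (NSEC_PER_SEC / HZ);
}

int main(void)
{
	/* 999999 ns truncates to 0 jiffies; 4 ms is exactly 4 jiffies. */
	printf("%llu\n", (unsigned long long)nsecs_to_jiffies64_model(999999));
	printf("%llu\n", (unsigned long long)nsecs_to_jiffies64_model(4000000));
	return 0;
}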