Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	404
1 file changed, 314 insertions(+), 90 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 04949089e76..c8e40b7005c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -278,14 +278,12 @@ struct task_group {
 #endif
 };
 
-#define root_task_group init_task_group
-
 /* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
 
 /*
  * A weight of 0 or 1 can cause arithmetics problems.
@@ -298,13 +296,13 @@ static DEFINE_SPINLOCK(task_group_lock);
 #define MIN_SHARES	2
 #define MAX_SHARES	(1UL << 18)
 
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
 
 /* Default task group.
  * Every task in system belong to this group at bootup.
  */
-struct task_group init_task_group;
+struct task_group root_task_group;
 
 #endif	/* CONFIG_CGROUP_SCHED */
 
@@ -326,7 +324,7 @@ struct cfs_rq {
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
	unsigned int nr_spread_over;
 
@@ -555,9 +553,6 @@ struct rq {
	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;
-
-	/* BKL stats */
-	unsigned int bkl_count;
 #endif
 };
 
@@ -743,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
	buf[cnt] = 0;
	cmp = strstrip(buf);
 
-	if (strncmp(buf, "NO_", 3) == 0) {
+	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}
@@ -1688,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
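/*
 * Editorial sketch (not part of the patch above): the UP variants keep the
 * SMP calling convention, so callers are expected to disable interrupts
 * around the lock/unlock pair themselves, as yield_to() further down in this
 * diff does. The function name below is illustrative only.
 */
static void example_lock_both_rqs(struct rq *rq1, struct rq *rq2)
{
	unsigned long flags;

	local_irq_save(flags);		/* double_rq_lock() does not do this */
	double_rq_lock(rq1, rq2);	/* on UP both rqs are the same lock */
	/* ... work on both runqueues ... */
	double_rq_unlock(rq1, rq2);
	local_irq_restore(flags);	/* nor does double_rq_unlock() restore */
}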
@@ -1882,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr)
	 */
	if (hardirq_count())
		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
		__this_cpu_add(cpu_softirq_time, delta);
 
	irq_time_write_end();
@@ -1922,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
		sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
	rq->clock_task += delta;
@@ -2027,14 +2087,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
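/*
 * Editorial note (not part of the patch): with this change the class
 * callbacks lose their 'running' argument and prio_changed() is only
 * invoked when the priority actually changed. A hypothetical scheduling
 * class would now declare the hooks roughly as follows; the names are
 * illustrative only, and each class presumably consults rq->curr itself
 * where it previously relied on 'running'.
 */
static void example_switched_from(struct rq *rq, struct task_struct *p) { }
static void example_switched_to(struct rq *rq, struct task_struct *p) { }
static void example_prio_changed(struct rq *rq, struct task_struct *p,
				 int oldprio) { }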
@@ -2226,7 +2286,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
	 * yield - it could be a while.
	 */
	if (unlikely(on_rq)) {
-		schedule_timeout_uninterruptible(1);
+		ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_hrtimeout(&to, HRTIMER_MODE_REL);
		continue;
	}
 
@@ -2267,27 +2330,6 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-			      void (*func) (void *info), void *info)
-{
-	int cpu;
-
-	preempt_disable();
-	cpu = task_cpu(p);
-	if (task_curr(p))
-		smp_call_function_single(cpu, func, info, 1);
-	preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2507,7 +2549,7 @@ out:
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
- * Put @p on the run-queue if it's not alredy there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
  * the current task. this_rq() stays locked over invocation.
  */
@@ -2568,6 +2610,7 @@ static void __sched_fork(struct task_struct *p)
	p->se.sum_exec_runtime = 0;
	p->se.prev_sum_exec_runtime = 0;
	p->se.nr_migrations = 0;
+	p->se.vruntime = 0;
 
 #ifdef CONFIG_SCHEDSTATS
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2778,9 +2821,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
 {
+	sched_info_switch(prev, next);
+	perf_event_task_sched_out(prev, next);
	fire_sched_out_preempt_notifiers(prev, next);
	prepare_lock_switch(rq, next);
	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
 }
 
 /**
@@ -2913,7 +2959,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
	struct mm_struct *mm, *oldmm;
 
	prepare_task_switch(rq, prev, next);
-	trace_sched_switch(prev, next);
+
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
@@ -3570,6 +3616,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3580,36 +3652,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
			cputime_t cputime, cputime_t cputime_scaled)
 {
	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime, cputime_scaled);
		return;
	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
  * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
  */
 void account_steal_time(cputime_t cputime)
 {
@@ -3637,6 +3699,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -3647,6 +3776,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
	if (user_tick)
		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3672,6 +3806,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
	account_idle_time(jiffies_to_cputime(ticks));
 }
 
@@ -3889,7 +4029,7 @@ static inline void schedule_debug(struct task_struct *prev)
	schedstat_inc(this_rq(), sched_count);
 #ifdef CONFIG_SCHEDSTATS
	if (unlikely(prev->lock_depth >= 0)) {
-		schedstat_inc(this_rq(), bkl_count);
+		schedstat_inc(this_rq(), rq_sched_info.bkl_count);
		schedstat_inc(prev, sched_info.bkl_count);
	}
 #endif
@@ -3991,9 +4131,6 @@ need_resched_nonpreemptible:
		rq->skip_clock_update = 0;
 
	if (likely(prev != next)) {
-		sched_info_switch(prev, next);
-		perf_event_task_sched_out(prev, next);
-
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;
@@ -4215,6 +4352,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
	__wake_up_common(q, mode, 1, 0, key);
 }
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4572,11 +4710,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
	if (running)
		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
	task_rq_unlock(rq, &flags);
 }
 
@@ -4824,12 +4961,15 @@ recheck:
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
		}
+
		/*
-		 * Like positive nice levels, dont allow tasks to
-		 * move out of SCHED_IDLE either:
+		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
+		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-			return -EPERM;
+		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+			if (!can_nice(p, TASK_NICE(p)))
+				return -EPERM;
+		}
 
		/* can't change other user's priorities */
		if (!check_same_owner(p))
@@ -4873,7 +5013,8 @@ recheck:
	 * assigned.
	 */
	if (rt_bandwidth_enabled() && rt_policy(policy) &&
-			task_group(p)->rt_bandwidth.rt_runtime == 0) {
+			task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+			!task_group_is_autogroup(task_group(p))) {
		__task_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
		return -EPERM;
@@ -4903,11 +5044,10 @@ recheck:
 
	if (running)
		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
		activate_task(rq, p, 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
	__task_rq_unlock(rq);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -5324,6 +5464,65 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct task_struct *curr = current;
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	bool yielded = 0;
+
+	local_irq_save(flags);
+	rq = this_rq();
+
+again:
+	p_rq = task_rq(p);
+	double_rq_lock(rq, p_rq);
+	while (task_rq(p) != p_rq) {
+		double_rq_unlock(rq, p_rq);
+		goto again;
+	}
+
+	if (!curr->sched_class->yield_to_task)
+		goto out;
+
+	if (curr->sched_class != p->sched_class)
+		goto out;
+
+	if (task_running(p_rq, p) || p->state)
+		goto out;
+
+	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+	if (yielded) {
+		schedstat_inc(rq, yld_count);
+		/*
+		 * Make p's CPU reschedule; pick_next_entity takes care of
+		 * fairness.
+		 */
+		if (preempt && rq != p_rq)
+			resched_task(p_rq->curr);
+	}
+
+out:
+	double_rq_unlock(rq, p_rq);
+	local_irq_restore(flags);
+
+	if (yielded)
+		schedule();
+
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
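/*
 * Editorial sketch (not part of the patch): a typical caller holds a
 * reference to the target task and asks yield_to() to push it onto a CPU,
 * e.g. a virtualization host that has spotted a spinning vCPU and wants to
 * boost the presumed lock holder. The helper name below is hypothetical.
 */
static bool example_boost_task(struct task_struct *target)
{
	bool boosted;

	get_task_struct(target);		/* keep the task_struct alive */
	boosted = yield_to(target, true);	/* true: kick target's CPU */
	put_task_struct(target);

	return boosted;
}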
@@ -5572,7 +5771,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
	 * The idle tasks have their own, simple scheduling class:
	 */
	idle->sched_class = &idle_sched_class;
-	ftrace_graph_init_task(idle);
+	ftrace_graph_init_idle_task(idle, cpu);
 }
 
 /*
@@ -7797,6 +7996,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+	cfs_rq->load_stamp = 1;
+#endif
 #endif
	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
@@ -7848,7 +8051,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
	cfs_rq->tg = tg;
 
	tg->se[cpu] = se;
-	/* se could be NULL for init_task_group */
+	/* se could be NULL for root_task_group */
	if (!se)
		return;
 
@@ -7908,18 +8111,18 @@ void __init sched_init(void)
		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		init_task_group.se = (struct sched_entity **)ptr;
+		root_task_group.se = (struct sched_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
 
-		init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
-		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
 
-		init_task_group.rt_rq = (struct rt_rq **)ptr;
+		root_task_group.rt_rq = (struct rt_rq **)ptr;
		ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_RT_GROUP_SCHED */
@@ -7939,13 +8142,13 @@ void __init sched_init(void)
			global_rt_period(), global_rt_runtime());
 
 #ifdef CONFIG_RT_GROUP_SCHED
-	init_rt_bandwidth(&init_task_group.rt_bandwidth,
+	init_rt_bandwidth(&root_task_group.rt_bandwidth,
			global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
-	list_add(&init_task_group.list, &task_groups);
-	INIT_LIST_HEAD(&init_task_group.children);
+	list_add(&root_task_group.list, &task_groups);
+	INIT_LIST_HEAD(&root_task_group.children);
	autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 
@@ -7960,34 +8163,34 @@ void __init sched_init(void)
		init_cfs_rq(&rq->cfs, rq);
		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		init_task_group.shares = init_task_group_load;
+		root_task_group.shares = root_task_group_load;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		/*
-		 * How much cpu bandwidth does init_task_group get?
+		 * How much cpu bandwidth does root_task_group get?
		 *
		 * In case of task-groups formed thr' the cgroup filesystem, it
		 * gets 100% of the cpu resources in the system. This overall
		 * system cpu resource is divided among the tasks of
-		 * init_task_group and its child task-groups in a fair manner,
+		 * root_task_group and its child task-groups in a fair manner,
		 * based on each entity's (task or task-group's) weight
		 * (se->load.weight).
		 *
-		 * In other words, if init_task_group has 10 tasks of weight
+		 * In other words, if root_task_group has 10 tasks of weight
		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
		 * then A0's share of the cpu resource is:
		 *
		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
		 *
-		 * We achieve this by letting init_task_group's tasks sit
-		 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+		 * We achieve this by letting root_task_group's tasks sit
+		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
		 */
-		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
+		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
+		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8110,6 +8313,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+	const struct sched_class *prev_class = p->sched_class;
+	int old_prio = p->prio;
	int on_rq;
 
	on_rq = p->se.on_rq;
@@ -8120,6 +8325,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
		activate_task(rq, p, 0);
		resched_task(rq->curr);
	}
+
+	check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
@@ -8379,6 +8586,7 @@ static void free_sched_group(struct task_group *tg)
 {
	free_fair_sched_group(tg);
	free_rt_sched_group(tg);
+	autogroup_free(tg);
	kfree(tg);
 }
 
@@ -8510,7 +8718,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
		/* Propagate contribution to hierarchy */
		raw_spin_lock_irqsave(&rq->lock, flags);
		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se), 0);
+			update_cfs_shares(group_cfs_rq(se));
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}
 
@@ -8812,7 +9020,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
	if (!cgrp->parent) {
		/* This is early initialization for the top cgroup */
-		return &init_task_group.css;
+		return &root_task_group.css;
	}
 
	parent = cgroup_tg(cgrp->parent);
@@ -8883,6 +9091,21 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
	}
 }
 
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't ran yet, this avoids
+	 * trying to poke a half freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	sched_move_task(task);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
				u64 shareval)
@@ -8955,6 +9178,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
	.destroy	= cpu_cgroup_destroy,
	.can_attach	= cpu_cgroup_can_attach,
	.attach		= cpu_cgroup_attach,
+	.exit		= cpu_cgroup_exit,
	.populate	= cpu_cgroup_populate,
	.subsys_id	= cpu_cgroup_subsys_id,
	.early_init	= 1,