path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 377
1 file changed, 296 insertions(+), 81 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..f592ce6f8616 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
@@ -324,7 +323,7 @@ struct cfs_rq {
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
 	unsigned int nr_spread_over;
 
@@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq)
 #endif
 
 /**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
  * @cpu: the processor in question.
  *
- * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
@@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
@@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
@@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * yield - it could be a while.
 		 */
 		if (unlikely(on_rq)) {
-			schedule_timeout_uninterruptible(1);
+			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 
@@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-			      void (*func) (void *info), void *info)
-{
-	int cpu;
-
-	preempt_disable();
-	cpu = task_cpu(p);
-	if (task_curr(p))
-		smp_call_function_single(cpu, func, info, 1);
-	preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
+	p->se.vruntime = 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2819,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
+	sched_info_switch(prev, next);
+	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
 }
 
 /**
@@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_sched_switch(prev, next);
+
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -3568,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
  * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
  */
 void account_steal_time(cputime_t cputime)
 {
@@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
@@ -3945,9 +4085,6 @@ need_resched:
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 
-	release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
@@ -3978,6 +4115,16 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
+	/*
+	 * If we are going to sleep and we have plugged IO queued, make
+	 * sure to submit it to avoid deadlocks.
+	 */
+	if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+		raw_spin_unlock(&rq->lock);
+		blk_flush_plug(prev);
+		raw_spin_lock(&rq->lock);
+	}
+
 	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
@@ -3989,9 +4136,6 @@ need_resched_nonpreemptible:
 	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
-		sched_info_switch(prev, next);
-		perf_event_task_sched_out(prev, next);
-
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
@@ -4010,9 +4154,6 @@ need_resched_nonpreemptible:
 
 	post_schedule(rq);
 
-	if (unlikely(reacquire_kernel_lock(prev)))
-		goto need_resched_nonpreemptible;
-
 	preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
@@ -4213,6 +4354,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
 	__wake_up_common(q, mode, 1, 0, key);
 }
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4570,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, &flags);
 }
 
@@ -4761,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
 
 	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (cred->euid == pcred->euid ||
-		 cred->euid == pcred->uid);
+	if (cred->user->user_ns == pcred->user->user_ns)
+		match = (cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid);
+	else
+		match = false;
 	rcu_read_unlock();
 	return match;
 }
@@ -4822,12 +4966,15 @@ recheck:
 			    param->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
+
 		/*
-		 * Like positive nice levels, dont allow tasks to
-		 * move out of SCHED_IDLE either:
+		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
+		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-			return -EPERM;
+		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+			if (!can_nice(p, TASK_NICE(p)))
+				return -EPERM;
+		}
 
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
@@ -4902,11 +5049,10 @@ recheck:
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		activate_task(rq, p, 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	__task_rq_unlock(rq);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -5088,7 +5234,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p);
@@ -5323,6 +5469,67 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct task_struct *curr = current;
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	bool yielded = 0;
+
+	local_irq_save(flags);
+	rq = this_rq();
+
+again:
+	p_rq = task_rq(p);
+	double_rq_lock(rq, p_rq);
+	while (task_rq(p) != p_rq) {
+		double_rq_unlock(rq, p_rq);
+		goto again;
+	}
+
+	if (!curr->sched_class->yield_to_task)
+		goto out;
+
+	if (curr->sched_class != p->sched_class)
+		goto out;
+
+	if (task_running(p_rq, p) || p->state)
+		goto out;
+
+	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+	if (yielded) {
+		schedstat_inc(rq, yld_count);
+		/*
+		 * Make p's CPU reschedule; pick_next_entity takes care of
+		 * fairness.
+		 */
+		if (preempt && rq != p_rq)
+			resched_task(p_rq->curr);
+	}
+
+out:
+	double_rq_unlock(rq, p_rq);
+	local_irq_restore(flags);
+
+	if (yielded)
+		schedule();
+
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
@@ -5333,6 +5540,7 @@ void __sched io_schedule(void)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
@@ -5348,6 +5556,7 @@ long __sched io_schedule_timeout(long timeout)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;
@@ -5571,7 +5780,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
-	ftrace_graph_init_task(idle);
+	ftrace_graph_init_idle_task(idle, cpu);
 }
 
 /*
@@ -7796,6 +8005,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+	cfs_rq->load_stamp = 1;
+#endif
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
@@ -8074,7 +8287,7 @@ static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 
-	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+	return (nested == preempt_offset);
 }
 
 void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8109,6 +8322,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+	const struct sched_class *prev_class = p->sched_class;
+	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->se.on_rq;
@@ -8119,6 +8334,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		activate_task(rq, p, 0);
 		resched_task(rq->curr);
 	}
+
+	check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
@@ -8234,7 +8451,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
-	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8463,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	tg->shares = NICE_0_LOAD;
 
 	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
 				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8510,7 +8724,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se), 0);
+			update_cfs_shares(group_cfs_rq(se));
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
@@ -8884,7 +9098,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.