Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	377
1 file changed, 296 insertions, 81 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..f592ce6f8616 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
@@ -324,7 +323,7 @@ struct cfs_rq {
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
 	unsigned int nr_spread_over;
 
@@ -606,9 +605,6 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -664,10 +660,9 @@ static void update_rq_clock(struct rq *rq)
 #endif
 
 /**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
  * @cpu: the processor in question.
  *
- * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
@@ -1686,6 +1681,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
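The UP stubs above keep the same contract as the SMP versions: the caller must already have interrupts disabled, and on a uniprocessor build both arguments refer to the one and only runqueue (hence the BUG_ON). A minimal caller sketch, assuming in-file use inside kernel/sched.c and using an illustrative function name, would look like:

static void lock_two_runqueues_example(struct rq *rq_a, struct rq *rq_b)
{
	unsigned long flags;

	local_irq_save(flags);		/* double_rq_lock() leaves irq state to the caller */
	double_rq_lock(rq_a, rq_b);
	/* ... operate on both runqueues; on !CONFIG_SMP, rq_a == rq_b ... */
	double_rq_unlock(rq_a, rq_b);
	local_irq_restore(flags);
}

The yield_to() helper added further down in this diff follows exactly this pattern.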
@@ -1880,7 +1908,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
@@ -1920,8 +1948,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 		sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -2025,14 +2085,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
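With the 'running' argument removed from these callbacks, a scheduling class that still needs to know whether @p is currently on the CPU now derives that from the runqueue itself. A hedged sketch of what a callback might look like under the new signatures, with a hypothetical function name (resched_task() and check_preempt_curr() are the existing helpers visible above):

static void switched_to_example(struct rq *rq, struct task_struct *p)
{
	if (!p->se.on_rq)
		return;

	if (rq->curr == p)		/* what the old 'running' flag used to convey */
		resched_task(rq->curr);
	else
		check_preempt_curr(rq, p, 0);
}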
@@ -2224,7 +2284,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * yield - it could be a while.
 		 */
 		if (unlikely(on_rq)) {
-			schedule_timeout_uninterruptible(1);
+			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 
@@ -2265,27 +2328,6 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-			      void (*func) (void *info), void *info)
-{
-	int cpu;
-
-	preempt_disable();
-	cpu = task_cpu(p);
-	if (task_curr(p))
-		smp_call_function_single(cpu, func, info, 1);
-	preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
  * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
@@ -2566,6 +2608,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
+	p->se.vruntime = 0;
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2776,9 +2819,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
 {
+	sched_info_switch(prev, next);
+	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
 }
 
 /**
@@ -2911,7 +2957,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_sched_switch(prev, next);
+
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -3568,6 +3614,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
  * Account system cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3650,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
  * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
  */
 void account_steal_time(cputime_t cputime)
 {
@@ -3635,6 +3697,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3774,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3804,12 @@ void account_steal_ticks(unsigned long ticks)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
@@ -3945,9 +4085,6 @@ need_resched:
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
 
-	release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
 	schedule_debug(prev);
 
 	if (sched_feat(HRTICK))
@@ -3978,6 +4115,16 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 	}
 
+	/*
+	 * If we are going to sleep and we have plugged IO queued, make
+	 * sure to submit it to avoid deadlocks.
+	 */
+	if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
+		raw_spin_unlock(&rq->lock);
+		blk_flush_plug(prev);
+		raw_spin_lock(&rq->lock);
+	}
+
 	pre_schedule(rq, prev);
 
 	if (unlikely(!rq->nr_running))
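The flush added above only matters because I/O submitted under an explicit plug sits on the task's plug list until the plug is finished. A hedged sketch of the plugging pattern that creates such pending I/O, using the block-layer plug API this change is paired with (the function name and the bio argument are illustrative only):

static void plugged_submit_example(struct bio *bio)
{
	struct blk_plug plug;

	blk_start_plug(&plug);		/* later submissions queue on the task's plug list */
	submit_bio(READ, bio);		/* not yet dispatched to the device */
	/* ... further bios could be batched here ... */
	blk_finish_plug(&plug);		/* normally flushes the batch */
}

If the task blocks between blk_start_plug() and blk_finish_plug(), the hunk above lets schedule() flush the pending requests so the sleeper cannot end up waiting on I/O it never actually issued.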
@@ -3989,9 +4136,6 @@ need_resched_nonpreemptible:
 	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
-		sched_info_switch(prev, next);
-		perf_event_task_sched_out(prev, next);
-
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
@@ -4010,9 +4154,6 @@ need_resched_nonpreemptible:
 
 	post_schedule(rq);
 
-	if (unlikely(reacquire_kernel_lock(prev)))
-		goto need_resched_nonpreemptible;
-
 	preempt_enable_no_resched();
 	if (need_resched())
 		goto need_resched;
@@ -4213,6 +4354,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
 	__wake_up_common(q, mode, 1, 0, key);
 }
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4570,11 +4712,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, &flags);
 }
 
@@ -4761,8 +4902,11 @@ static bool check_same_owner(struct task_struct *p)
 
 	rcu_read_lock();
 	pcred = __task_cred(p);
-	match = (cred->euid == pcred->euid ||
-		 cred->euid == pcred->uid);
+	if (cred->user->user_ns == pcred->user->user_ns)
+		match = (cred->euid == pcred->euid ||
+			 cred->euid == pcred->uid);
+	else
+		match = false;
 	rcu_read_unlock();
 	return match;
 }
@@ -4822,12 +4966,15 @@ recheck:
			    param->sched_priority > rlim_rtprio)
				return -EPERM;
 		}
+
 		/*
-		 * Like positive nice levels, dont allow tasks to
-		 * move out of SCHED_IDLE either:
+		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
+		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
-		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
-			return -EPERM;
+		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+			if (!can_nice(p, TASK_NICE(p)))
+				return -EPERM;
+		}
 
 		/* can't change other user's priorities */
 		if (!check_same_owner(p))
@@ -4902,11 +5049,10 @@ recheck:
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq) {
+	if (on_rq)
 		activate_task(rq, p, 0);
 
-		check_class_changed(rq, p, prev_class, oldprio, running);
-	}
+	check_class_changed(rq, p, prev_class, oldprio);
 	__task_rq_unlock(rq);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
@@ -5088,7 +5234,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 		goto out_free_cpus_allowed;
 	}
 	retval = -EPERM;
-	if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+	if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
 		goto out_unlock;
 
 	retval = security_task_setscheduler(p);
@@ -5323,6 +5469,67 @@ void __sched yield(void)
 }
 EXPORT_SYMBOL(yield);
 
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct task_struct *curr = current;
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	bool yielded = 0;
+
+	local_irq_save(flags);
+	rq = this_rq();
+
+again:
+	p_rq = task_rq(p);
+	double_rq_lock(rq, p_rq);
+	while (task_rq(p) != p_rq) {
+		double_rq_unlock(rq, p_rq);
+		goto again;
+	}
+
+	if (!curr->sched_class->yield_to_task)
+		goto out;
+
+	if (curr->sched_class != p->sched_class)
+		goto out;
+
+	if (task_running(p_rq, p) || p->state)
+		goto out;
+
+	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+	if (yielded) {
+		schedstat_inc(rq, yld_count);
+		/*
+		 * Make p's CPU reschedule; pick_next_entity takes care of
+		 * fairness.
+		 */
+		if (preempt && rq != p_rq)
+			resched_task(p_rq->curr);
+	}
+
+out:
+	double_rq_unlock(rq, p_rq);
+	local_irq_restore(flags);
+
+	if (yielded)
+		schedule();
+
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
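yield_to() leaves it to the caller to make sure the target task cannot go away. A hedged usage sketch of a directed-yield helper in the style a lock-holder-preemption-aware caller might write; the function name is hypothetical, while the pid/task-reference helpers are the standard kernel ones:

static bool directed_yield_example(struct pid *target_pid)
{
	struct task_struct *task;
	bool yielded = false;

	rcu_read_lock();
	task = pid_task(target_pid, PIDTYPE_PID);
	if (task)
		get_task_struct(task);	/* pin it: yield_to() may end up in schedule() */
	rcu_read_unlock();

	if (!task)
		return false;

	if (task != current)
		yielded = yield_to(task, true);

	put_task_struct(task);
	return yielded;
}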
@@ -5333,6 +5540,7 @@ void __sched io_schedule(void)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	schedule();
 	current->in_iowait = 0;
@@ -5348,6 +5556,7 @@ long __sched io_schedule_timeout(long timeout)
 
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	blk_flush_plug(current);
 	current->in_iowait = 1;
 	ret = schedule_timeout(timeout);
 	current->in_iowait = 0;
@@ -5571,7 +5780,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	idle->sched_class = &idle_sched_class;
-	ftrace_graph_init_task(idle);
+	ftrace_graph_init_idle_task(idle, cpu);
 }
 
 /*
@@ -7796,6 +8005,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	INIT_LIST_HEAD(&cfs_rq->tasks);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	cfs_rq->rq = rq;
+	/* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+	cfs_rq->load_stamp = 1;
+#endif
 #endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
@@ -8074,7 +8287,7 @@ static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
 
-	return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+	return (nested == preempt_offset);
 }
 
 void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8109,6 +8322,8 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
+	const struct sched_class *prev_class = p->sched_class;
+	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->se.on_rq;
@@ -8119,6 +8334,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 		activate_task(rq, p, 0);
 		resched_task(rq->curr);
 	}
+
+	check_class_changed(rq, p, prev_class, old_prio);
 }
 
 void normalize_rt_tasks(void)
@@ -8234,7 +8451,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
-	struct rq *rq;
 	int i;
 
 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8463,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	tg->shares = NICE_0_LOAD;
 
 	for_each_possible_cpu(i) {
-		rq = cpu_rq(i);
-
 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				      GFP_KERNEL, cpu_to_node(i));
 		if (!cfs_rq)
@@ -8510,7 +8724,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/* Propagate contribution to hierarchy */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se), 0);
+			update_cfs_shares(group_cfs_rq(se));
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
@@ -8884,7 +9098,8 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 }
 
 static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+		struct cgroup *old_cgrp, struct task_struct *task)
 {
 	/*
 	 * cgroup_exit() is called in the copy_process() failure path.