author     Linus Torvalds <torvalds@linux-foundation.org>   2011-03-15 21:37:30 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2011-03-15 21:37:30 -0400
commit     9620639b7ea3843983f4ced8b4c81eb4d8974838
tree       54266fac3bcf89e61ae06c7d36ca708df6e0ea33
parent     a926021cb1f8a99a275eaf6eb546102e9469dc59
parent     6d1cafd8b56ea726c10a5a104de57cc3ed8fa953
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (26 commits)
sched: Resched proper CPU on yield_to()
sched: Allow users with sufficient RLIMIT_NICE to change from SCHED_IDLE policy
sched: Allow SCHED_BATCH to preempt SCHED_IDLE tasks
sched: Clean up the IRQ_TIME_ACCOUNTING code
sched: Add #ifdef around irq time accounting functions
sched, autogroup: Stop claiming ownership of the root task group
sched, autogroup: Stop going ahead if autogroup is disabled
sched, autogroup, sysctl: Use proc_dointvec_minmax() instead
sched: Fix the group_imb logic
sched: Clean up some f_b_g() comments
sched: Clean up remnants of sd_idle
sched: Wholesale removal of sd_idle logic
sched: Add yield_to(task, preempt) functionality
sched: Use a buddy to implement yield_task_fair()
sched: Limit the scope of clear_buddies
sched: Check the right ->nr_running in yield_task_fair()
sched: Avoid expensive initial update_cfs_load(), on UP too
sched: Fix switch_from_fair()
sched: Simplify the idle scheduling class
softirqs: Account ksoftirqd time as cpustat softirq
...
-rw-r--r--  include/asm-generic/cputime.h |   3
-rw-r--r--  include/linux/interrupt.h     |   7
-rw-r--r--  include/linux/jiffies.h       |   1
-rw-r--r--  include/linux/sched.h         |  13
-rw-r--r--  kernel/sched.c                | 296
-rw-r--r--  kernel/sched_autogroup.c      |  15
-rw-r--r--  kernel/sched_autogroup.h      |   5
-rw-r--r--  kernel/sched_debug.c          |   2
-rw-r--r--  kernel/sched_fair.c           | 397
-rw-r--r--  kernel/sched_idletask.c       |  26
-rw-r--r--  kernel/sched_rt.c             |  19
-rw-r--r--  kernel/sched_stoptask.c       |   7
-rw-r--r--  kernel/softirq.c              |   3
-rw-r--r--  kernel/sysctl.c               |   9
-rw-r--r--  kernel/time.c                 |  23
15 files changed, 565 insertions, 261 deletions
diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h
index 2bcc5c7c22a6..61e03dd7939e 100644
--- a/include/asm-generic/cputime.h
+++ b/include/asm-generic/cputime.h
@@ -30,6 +30,9 @@ typedef u64 cputime64_t; | |||
30 | #define cputime64_to_jiffies64(__ct) (__ct) | 30 | #define cputime64_to_jiffies64(__ct) (__ct) |
31 | #define jiffies64_to_cputime64(__jif) (__jif) | 31 | #define jiffies64_to_cputime64(__jif) (__jif) |
32 | #define cputime_to_cputime64(__ct) ((u64) __ct) | 32 | #define cputime_to_cputime64(__ct) ((u64) __ct) |
33 | #define cputime64_gt(__a, __b) ((__a) > (__b)) | ||
34 | |||
35 | #define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct) | ||
33 | 36 | ||
34 | 37 | ||
35 | /* | 38 | /* |
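The two additions above exist so that the IRQ-time accounting code introduced later in this diff (irqtime_account_hi_update()/irqtime_account_si_update() in kernel/sched.c) can convert its nanosecond counters into the cputime64 units used by cpustat and compare them. A minimal userspace model of that comparison, assuming the asm-generic mapping where cputime64_t is a 64-bit jiffies count and an illustrative HZ of 1000:

#include <stdint.h>
#include <stdio.h>

#define HZ 1000ULL                     /* illustrative tick rate, not from the patch */
#define NSEC_PER_SEC 1000000000ULL

/* asm-generic mapping: cputime64 is jiffies64, so nanoseconds just scale down */
static uint64_t nsecs_to_cputime64(uint64_t ns) { return ns / (NSEC_PER_SEC / HZ); }
static int cputime64_gt(uint64_t a, uint64_t b) { return a > b; }

int main(void)
{
    uint64_t hardirq_ns = 4200000;     /* 4.2 ms of hard-IRQ time measured so far     */
    uint64_t accounted  = 3;           /* 3 ticks already folded into cpustat->irq    */

    /* Another tick is charged to "irq" only when the measured time ran ahead. */
    printf("charge tick to irq: %s\n",
           cputime64_gt(nsecs_to_cputime64(hardirq_ns), accounted) ? "yes" : "no");
    return 0;
}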
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index d746da19c6a2..2eb16e03422f 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -427,6 +427,13 @@ extern void raise_softirq(unsigned int nr); | |||
427 | */ | 427 | */ |
428 | DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); | 428 | DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); |
429 | 429 | ||
430 | DECLARE_PER_CPU(struct task_struct *, ksoftirqd); | ||
431 | |||
432 | static inline struct task_struct *this_cpu_ksoftirqd(void) | ||
433 | { | ||
434 | return this_cpu_read(ksoftirqd); | ||
435 | } | ||
436 | |||
430 | /* Try to send a softirq to a remote cpu. If this cannot be done, the | 437 | /* Try to send a softirq to a remote cpu. If this cannot be done, the |
431 | * work will be queued to the local cpu. | 438 | * work will be queued to the local cpu. |
432 | */ | 439 | */ |
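The new per-CPU ksoftirqd pointer and this_cpu_ksoftirqd() accessor replace the PF_KSOFTIRQD task-flag test in account_system_vtime() (see the kernel/sched.c hunk further down): softirq time is charged to cpu_softirq_time only when it is not being executed by ksoftirqd itself. A standalone model of that identity check, with illustrative names standing in for the per-CPU data and the current task:

#include <stdio.h>

struct task { const char *comm; };

#define NR_CPUS 2
static struct task ksoftirqd_threads[NR_CPUS] = { { "ksoftirqd/0" }, { "ksoftirqd/1" } };
static struct task *per_cpu_ksoftirqd[NR_CPUS] = { &ksoftirqd_threads[0], &ksoftirqd_threads[1] };

/* stand-in for this_cpu_ksoftirqd(); real kernel code reads a per-CPU variable */
static struct task *this_cpu_ksoftirqd_model(int cpu) { return per_cpu_ksoftirqd[cpu]; }

int main(void)
{
    struct task app = { "myapp" };
    struct task *current_task = &app;   /* illustrative "current" */
    int cpu = 0;

    if (current_task != this_cpu_ksoftirqd_model(cpu))
        printf("charge delta to cpu_softirq_time\n");
    else
        printf("ksoftirqd: handled separately by the tick accounting path\n");
    return 0;
}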
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 6811f4bfc6e7..922aa313c9f9 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -307,6 +307,7 @@ extern clock_t jiffies_to_clock_t(long x); | |||
307 | extern unsigned long clock_t_to_jiffies(unsigned long x); | 307 | extern unsigned long clock_t_to_jiffies(unsigned long x); |
308 | extern u64 jiffies_64_to_clock_t(u64 x); | 308 | extern u64 jiffies_64_to_clock_t(u64 x); |
309 | extern u64 nsec_to_clock_t(u64 x); | 309 | extern u64 nsec_to_clock_t(u64 x); |
310 | extern u64 nsecs_to_jiffies64(u64 n); | ||
310 | extern unsigned long nsecs_to_jiffies(u64 n); | 311 | extern unsigned long nsecs_to_jiffies(u64 n); |
311 | 312 | ||
312 | #define TIMESTAMP_SIZE 30 | 313 | #define TIMESTAMP_SIZE 30 |
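nsecs_to_jiffies64() itself is implemented in kernel/time.c (part of this merge, though that hunk is not shown above). A hedged sketch of the common case, where NSEC_PER_SEC is an exact multiple of HZ; the in-kernel version also has to handle HZ values that do not divide evenly and guard against 64-bit overflow:

#include <stdint.h>
#include <stdio.h>

#define HZ 250ULL                       /* illustrative config value */
#define NSEC_PER_SEC 1000000000ULL

static uint64_t nsecs_to_jiffies64_sketch(uint64_t n)
{
    return n / (NSEC_PER_SEC / HZ);     /* one jiffy is NSEC_PER_SEC/HZ nanoseconds */
}

int main(void)
{
    /* 8 ms -> 2 jiffies at HZ=250 */
    printf("%llu\n", (unsigned long long)nsecs_to_jiffies64_sketch(8000000));
    return 0;
}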
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c57e5278df83..214af2ed11b5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1058,6 +1058,7 @@ struct sched_class { | |||
1058 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1058 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
1059 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1059 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
1060 | void (*yield_task) (struct rq *rq); | 1060 | void (*yield_task) (struct rq *rq); |
1061 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | ||
1061 | 1062 | ||
1062 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1063 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); |
1063 | 1064 | ||
@@ -1084,12 +1085,10 @@ struct sched_class { | |||
1084 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1085 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); |
1085 | void (*task_fork) (struct task_struct *p); | 1086 | void (*task_fork) (struct task_struct *p); |
1086 | 1087 | ||
1087 | void (*switched_from) (struct rq *this_rq, struct task_struct *task, | 1088 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); |
1088 | int running); | 1089 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
1089 | void (*switched_to) (struct rq *this_rq, struct task_struct *task, | ||
1090 | int running); | ||
1091 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1090 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
1092 | int oldprio, int running); | 1091 | int oldprio); |
1093 | 1092 | ||
1094 | unsigned int (*get_rr_interval) (struct rq *rq, | 1093 | unsigned int (*get_rr_interval) (struct rq *rq, |
1095 | struct task_struct *task); | 1094 | struct task_struct *task); |
@@ -1715,7 +1714,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * | |||
1715 | /* | 1714 | /* |
1716 | * Per process flags | 1715 | * Per process flags |
1717 | */ | 1716 | */ |
1718 | #define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */ | ||
1719 | #define PF_STARTING 0x00000002 /* being created */ | 1717 | #define PF_STARTING 0x00000002 /* being created */ |
1720 | #define PF_EXITING 0x00000004 /* getting shut down */ | 1718 | #define PF_EXITING 0x00000004 /* getting shut down */ |
1721 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ | 1719 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ |
@@ -1945,8 +1943,6 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
1945 | void __user *buffer, size_t *lenp, | 1943 | void __user *buffer, size_t *lenp, |
1946 | loff_t *ppos); | 1944 | loff_t *ppos); |
1947 | 1945 | ||
1948 | extern unsigned int sysctl_sched_compat_yield; | ||
1949 | |||
1950 | #ifdef CONFIG_SCHED_AUTOGROUP | 1946 | #ifdef CONFIG_SCHED_AUTOGROUP |
1951 | extern unsigned int sysctl_sched_autogroup_enabled; | 1947 | extern unsigned int sysctl_sched_autogroup_enabled; |
1952 | 1948 | ||
@@ -1977,6 +1973,7 @@ static inline int rt_mutex_getprio(struct task_struct *p) | |||
1977 | # define rt_mutex_adjust_pi(p) do { } while (0) | 1973 | # define rt_mutex_adjust_pi(p) do { } while (0) |
1978 | #endif | 1974 | #endif |
1979 | 1975 | ||
1976 | extern bool yield_to(struct task_struct *p, bool preempt); | ||
1980 | extern void set_user_nice(struct task_struct *p, long nice); | 1977 | extern void set_user_nice(struct task_struct *p, long nice); |
1981 | extern int task_prio(const struct task_struct *p); | 1978 | extern int task_prio(const struct task_struct *p); |
1982 | extern int task_nice(const struct task_struct *p); | 1979 | extern int task_nice(const struct task_struct *p); |
diff --git a/kernel/sched.c b/kernel/sched.c
index 57a18e8d28c8..27125e413576 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -324,7 +324,7 @@ struct cfs_rq { | |||
324 | * 'curr' points to currently running entity on this cfs_rq. | 324 | * 'curr' points to currently running entity on this cfs_rq. |
325 | * It is set to NULL otherwise (i.e when none are currently running). | 325 | * It is set to NULL otherwise (i.e when none are currently running). |
326 | */ | 326 | */ |
327 | struct sched_entity *curr, *next, *last; | 327 | struct sched_entity *curr, *next, *last, *skip; |
328 | 328 | ||
329 | unsigned int nr_spread_over; | 329 | unsigned int nr_spread_over; |
330 | 330 | ||
@@ -1683,6 +1683,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1683 | __release(rq2->lock); | 1683 | __release(rq2->lock); |
1684 | } | 1684 | } |
1685 | 1685 | ||
1686 | #else /* CONFIG_SMP */ | ||
1687 | |||
1688 | /* | ||
1689 | * double_rq_lock - safely lock two runqueues | ||
1690 | * | ||
1691 | * Note this does not disable interrupts like task_rq_lock, | ||
1692 | * you need to do so manually before calling. | ||
1693 | */ | ||
1694 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | ||
1695 | __acquires(rq1->lock) | ||
1696 | __acquires(rq2->lock) | ||
1697 | { | ||
1698 | BUG_ON(!irqs_disabled()); | ||
1699 | BUG_ON(rq1 != rq2); | ||
1700 | raw_spin_lock(&rq1->lock); | ||
1701 | __acquire(rq2->lock); /* Fake it out ;) */ | ||
1702 | } | ||
1703 | |||
1704 | /* | ||
1705 | * double_rq_unlock - safely unlock two runqueues | ||
1706 | * | ||
1707 | * Note this does not restore interrupts like task_rq_unlock, | ||
1708 | * you need to do so manually after calling. | ||
1709 | */ | ||
1710 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | ||
1711 | __releases(rq1->lock) | ||
1712 | __releases(rq2->lock) | ||
1713 | { | ||
1714 | BUG_ON(rq1 != rq2); | ||
1715 | raw_spin_unlock(&rq1->lock); | ||
1716 | __release(rq2->lock); | ||
1717 | } | ||
1718 | |||
1686 | #endif | 1719 | #endif |
1687 | 1720 | ||
1688 | static void calc_load_account_idle(struct rq *this_rq); | 1721 | static void calc_load_account_idle(struct rq *this_rq); |
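The UP variants above exist so that yield_to() (added later in this file) can call double_rq_lock()/double_rq_unlock() unconditionally: with a single runqueue both arguments are necessarily the same rq, so "locking both" collapses into taking one lock once. A standalone model of that collapse, using a pthread mutex as an illustrative stand-in for rq->lock:

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct rq_model { pthread_mutex_t lock; };

static void double_rq_lock_up(struct rq_model *rq1, struct rq_model *rq2)
{
    assert(rq1 == rq2);                 /* only one runqueue exists on UP */
    pthread_mutex_lock(&rq1->lock);     /* rq2->lock is the very same lock */
}

static void double_rq_unlock_up(struct rq_model *rq1, struct rq_model *rq2)
{
    assert(rq1 == rq2);
    pthread_mutex_unlock(&rq1->lock);
}

int main(void)
{
    struct rq_model rq = { PTHREAD_MUTEX_INITIALIZER };

    /* what yield_to() effectively does when caller and target share the only CPU */
    double_rq_lock_up(&rq, &rq);
    double_rq_unlock_up(&rq, &rq);
    puts("ok");
    return 0;
}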
@@ -1877,7 +1910,7 @@ void account_system_vtime(struct task_struct *curr) | |||
1877 | */ | 1910 | */ |
1878 | if (hardirq_count()) | 1911 | if (hardirq_count()) |
1879 | __this_cpu_add(cpu_hardirq_time, delta); | 1912 | __this_cpu_add(cpu_hardirq_time, delta); |
1880 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1913 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) |
1881 | __this_cpu_add(cpu_softirq_time, delta); | 1914 | __this_cpu_add(cpu_softirq_time, delta); |
1882 | 1915 | ||
1883 | irq_time_write_end(); | 1916 | irq_time_write_end(); |
@@ -1917,8 +1950,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
1917 | sched_rt_avg_update(rq, irq_delta); | 1950 | sched_rt_avg_update(rq, irq_delta); |
1918 | } | 1951 | } |
1919 | 1952 | ||
1953 | static int irqtime_account_hi_update(void) | ||
1954 | { | ||
1955 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
1956 | unsigned long flags; | ||
1957 | u64 latest_ns; | ||
1958 | int ret = 0; | ||
1959 | |||
1960 | local_irq_save(flags); | ||
1961 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
1962 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) | ||
1963 | ret = 1; | ||
1964 | local_irq_restore(flags); | ||
1965 | return ret; | ||
1966 | } | ||
1967 | |||
1968 | static int irqtime_account_si_update(void) | ||
1969 | { | ||
1970 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
1971 | unsigned long flags; | ||
1972 | u64 latest_ns; | ||
1973 | int ret = 0; | ||
1974 | |||
1975 | local_irq_save(flags); | ||
1976 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
1977 | if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) | ||
1978 | ret = 1; | ||
1979 | local_irq_restore(flags); | ||
1980 | return ret; | ||
1981 | } | ||
1982 | |||
1920 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 1983 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
1921 | 1984 | ||
1985 | #define sched_clock_irqtime (0) | ||
1986 | |||
1922 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 1987 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1923 | { | 1988 | { |
1924 | rq->clock_task += delta; | 1989 | rq->clock_task += delta; |
@@ -2022,14 +2087,14 @@ inline int task_curr(const struct task_struct *p) | |||
2022 | 2087 | ||
2023 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 2088 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
2024 | const struct sched_class *prev_class, | 2089 | const struct sched_class *prev_class, |
2025 | int oldprio, int running) | 2090 | int oldprio) |
2026 | { | 2091 | { |
2027 | if (prev_class != p->sched_class) { | 2092 | if (prev_class != p->sched_class) { |
2028 | if (prev_class->switched_from) | 2093 | if (prev_class->switched_from) |
2029 | prev_class->switched_from(rq, p, running); | 2094 | prev_class->switched_from(rq, p); |
2030 | p->sched_class->switched_to(rq, p, running); | 2095 | p->sched_class->switched_to(rq, p); |
2031 | } else | 2096 | } else if (oldprio != p->prio) |
2032 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2097 | p->sched_class->prio_changed(rq, p, oldprio); |
2033 | } | 2098 | } |
2034 | 2099 | ||
2035 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | 2100 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) |
@@ -2542,6 +2607,7 @@ static void __sched_fork(struct task_struct *p) | |||
2542 | p->se.sum_exec_runtime = 0; | 2607 | p->se.sum_exec_runtime = 0; |
2543 | p->se.prev_sum_exec_runtime = 0; | 2608 | p->se.prev_sum_exec_runtime = 0; |
2544 | p->se.nr_migrations = 0; | 2609 | p->se.nr_migrations = 0; |
2610 | p->se.vruntime = 0; | ||
2545 | 2611 | ||
2546 | #ifdef CONFIG_SCHEDSTATS | 2612 | #ifdef CONFIG_SCHEDSTATS |
2547 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); | 2613 | memset(&p->se.statistics, 0, sizeof(p->se.statistics)); |
@@ -3547,6 +3613,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
3547 | } | 3613 | } |
3548 | 3614 | ||
3549 | /* | 3615 | /* |
3616 | * Account system cpu time to a process and desired cpustat field | ||
3617 | * @p: the process that the cpu time gets accounted to | ||
3618 | * @cputime: the cpu time spent in kernel space since the last update | ||
3619 | * @cputime_scaled: cputime scaled by cpu frequency | ||
3620 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
3621 | */ | ||
3622 | static inline | ||
3623 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
3624 | cputime_t cputime_scaled, cputime64_t *target_cputime64) | ||
3625 | { | ||
3626 | cputime64_t tmp = cputime_to_cputime64(cputime); | ||
3627 | |||
3628 | /* Add system time to process. */ | ||
3629 | p->stime = cputime_add(p->stime, cputime); | ||
3630 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3631 | account_group_system_time(p, cputime); | ||
3632 | |||
3633 | /* Add system time to cpustat. */ | ||
3634 | *target_cputime64 = cputime64_add(*target_cputime64, tmp); | ||
3635 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3636 | |||
3637 | /* Account for system time used */ | ||
3638 | acct_update_integrals(p); | ||
3639 | } | ||
3640 | |||
3641 | /* | ||
3550 | * Account system cpu time to a process. | 3642 | * Account system cpu time to a process. |
3551 | * @p: the process that the cpu time gets accounted to | 3643 | * @p: the process that the cpu time gets accounted to |
3552 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3644 | * @hardirq_offset: the offset to subtract from hardirq_count() |
@@ -3557,36 +3649,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3557 | cputime_t cputime, cputime_t cputime_scaled) | 3649 | cputime_t cputime, cputime_t cputime_scaled) |
3558 | { | 3650 | { |
3559 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3651 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3560 | cputime64_t tmp; | 3652 | cputime64_t *target_cputime64; |
3561 | 3653 | ||
3562 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 3654 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
3563 | account_guest_time(p, cputime, cputime_scaled); | 3655 | account_guest_time(p, cputime, cputime_scaled); |
3564 | return; | 3656 | return; |
3565 | } | 3657 | } |
3566 | 3658 | ||
3567 | /* Add system time to process. */ | ||
3568 | p->stime = cputime_add(p->stime, cputime); | ||
3569 | p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); | ||
3570 | account_group_system_time(p, cputime); | ||
3571 | |||
3572 | /* Add system time to cpustat. */ | ||
3573 | tmp = cputime_to_cputime64(cputime); | ||
3574 | if (hardirq_count() - hardirq_offset) | 3659 | if (hardirq_count() - hardirq_offset) |
3575 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3660 | target_cputime64 = &cpustat->irq; |
3576 | else if (in_serving_softirq()) | 3661 | else if (in_serving_softirq()) |
3577 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3662 | target_cputime64 = &cpustat->softirq; |
3578 | else | 3663 | else |
3579 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3664 | target_cputime64 = &cpustat->system; |
3580 | |||
3581 | cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); | ||
3582 | 3665 | ||
3583 | /* Account for system time used */ | 3666 | __account_system_time(p, cputime, cputime_scaled, target_cputime64); |
3584 | acct_update_integrals(p); | ||
3585 | } | 3667 | } |
3586 | 3668 | ||
3587 | /* | 3669 | /* |
3588 | * Account for involuntary wait time. | 3670 | * Account for involuntary wait time. |
3589 | * @steal: the cpu time spent in involuntary wait | 3671 | * @cputime: the cpu time spent in involuntary wait |
3590 | */ | 3672 | */ |
3591 | void account_steal_time(cputime_t cputime) | 3673 | void account_steal_time(cputime_t cputime) |
3592 | { | 3674 | { |
@@ -3614,6 +3696,73 @@ void account_idle_time(cputime_t cputime) | |||
3614 | 3696 | ||
3615 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | 3697 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING |
3616 | 3698 | ||
3699 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
3700 | /* | ||
3701 | * Account a tick to a process and cpustat | ||
3702 | * @p: the process that the cpu time gets accounted to | ||
3703 | * @user_tick: is the tick from userspace | ||
3704 | * @rq: the pointer to rq | ||
3705 | * | ||
3706 | * Tick demultiplexing follows the order | ||
3707 | * - pending hardirq update | ||
3708 | * - pending softirq update | ||
3709 | * - user_time | ||
3710 | * - idle_time | ||
3711 | * - system time | ||
3712 | * - check for guest_time | ||
3713 | * - else account as system_time | ||
3714 | * | ||
3715 | * Check for hardirq is done both for system and user time as there is | ||
3716 | * no timer going off while we are on hardirq and hence we may never get an | ||
3717 | * opportunity to update it solely in system time. | ||
3718 | * p->stime and friends are only updated on system time and not on irq | ||
3719 | * softirq as those do not count in task exec_runtime any more. | ||
3720 | */ | ||
3721 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3722 | struct rq *rq) | ||
3723 | { | ||
3724 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
3725 | cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); | ||
3726 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | ||
3727 | |||
3728 | if (irqtime_account_hi_update()) { | ||
3729 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | ||
3730 | } else if (irqtime_account_si_update()) { | ||
3731 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | ||
3732 | } else if (this_cpu_ksoftirqd() == p) { | ||
3733 | /* | ||
3734 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
3735 | * So, we have to handle it separately here. | ||
3736 | * Also, p->stime needs to be updated for ksoftirqd. | ||
3737 | */ | ||
3738 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3739 | &cpustat->softirq); | ||
3740 | } else if (user_tick) { | ||
3741 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3742 | } else if (p == rq->idle) { | ||
3743 | account_idle_time(cputime_one_jiffy); | ||
3744 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
3745 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
3746 | } else { | ||
3747 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
3748 | &cpustat->system); | ||
3749 | } | ||
3750 | } | ||
3751 | |||
3752 | static void irqtime_account_idle_ticks(int ticks) | ||
3753 | { | ||
3754 | int i; | ||
3755 | struct rq *rq = this_rq(); | ||
3756 | |||
3757 | for (i = 0; i < ticks; i++) | ||
3758 | irqtime_account_process_tick(current, 0, rq); | ||
3759 | } | ||
3760 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3761 | static void irqtime_account_idle_ticks(int ticks) {} | ||
3762 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
3763 | struct rq *rq) {} | ||
3764 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
3765 | |||
3617 | /* | 3766 | /* |
3618 | * Account a single tick of cpu time. | 3767 | * Account a single tick of cpu time. |
3619 | * @p: the process that the cpu time gets accounted to | 3768 | * @p: the process that the cpu time gets accounted to |
@@ -3624,6 +3773,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
3624 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | 3773 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
3625 | struct rq *rq = this_rq(); | 3774 | struct rq *rq = this_rq(); |
3626 | 3775 | ||
3776 | if (sched_clock_irqtime) { | ||
3777 | irqtime_account_process_tick(p, user_tick, rq); | ||
3778 | return; | ||
3779 | } | ||
3780 | |||
3627 | if (user_tick) | 3781 | if (user_tick) |
3628 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | 3782 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
3629 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 3783 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
@@ -3649,6 +3803,12 @@ void account_steal_ticks(unsigned long ticks) | |||
3649 | */ | 3803 | */ |
3650 | void account_idle_ticks(unsigned long ticks) | 3804 | void account_idle_ticks(unsigned long ticks) |
3651 | { | 3805 | { |
3806 | |||
3807 | if (sched_clock_irqtime) { | ||
3808 | irqtime_account_idle_ticks(ticks); | ||
3809 | return; | ||
3810 | } | ||
3811 | |||
3652 | account_idle_time(jiffies_to_cputime(ticks)); | 3812 | account_idle_time(jiffies_to_cputime(ticks)); |
3653 | } | 3813 | } |
3654 | 3814 | ||
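The order of the checks in irqtime_account_process_tick() above is the whole point of the function: each tick is charged to at most one bucket, tested in a fixed priority order. A self-contained model of that demultiplexing, with the kernel's state reduced to a plain struct of flags (the names are illustrative, not kernel API):

#include <stdbool.h>
#include <stdio.h>

enum bucket { B_IRQ, B_SOFTIRQ, B_KSOFTIRQD_SYSTEM, B_USER, B_IDLE, B_GUEST, B_SYSTEM };

struct tick_state {
    bool hardirq_time_pending;   /* irqtime_account_hi_update() would return 1 */
    bool softirq_time_pending;   /* irqtime_account_si_update() would return 1 */
    bool current_is_ksoftirqd;
    bool user_tick;
    bool current_is_idle;
    bool current_is_vcpu;        /* PF_VCPU */
};

static enum bucket classify_tick(const struct tick_state *s)
{
    if (s->hardirq_time_pending)  return B_IRQ;
    if (s->softirq_time_pending)  return B_SOFTIRQ;
    if (s->current_is_ksoftirqd)  return B_KSOFTIRQD_SYSTEM;
    if (s->user_tick)             return B_USER;
    if (s->current_is_idle)       return B_IDLE;
    if (s->current_is_vcpu)       return B_GUEST;
    return B_SYSTEM;
}

int main(void)
{
    struct tick_state s = { .softirq_time_pending = true, .user_tick = true };
    /* Even a user-mode tick is charged to softirq when softirq time ran ahead. */
    printf("bucket = %d\n", classify_tick(&s));
    return 0;
}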
@@ -4547,11 +4707,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4547 | 4707 | ||
4548 | if (running) | 4708 | if (running) |
4549 | p->sched_class->set_curr_task(rq); | 4709 | p->sched_class->set_curr_task(rq); |
4550 | if (on_rq) { | 4710 | if (on_rq) |
4551 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); | 4711 | enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); |
4552 | 4712 | ||
4553 | check_class_changed(rq, p, prev_class, oldprio, running); | 4713 | check_class_changed(rq, p, prev_class, oldprio); |
4554 | } | ||
4555 | task_rq_unlock(rq, &flags); | 4714 | task_rq_unlock(rq, &flags); |
4556 | } | 4715 | } |
4557 | 4716 | ||
@@ -4799,12 +4958,15 @@ recheck: | |||
4799 | param->sched_priority > rlim_rtprio) | 4958 | param->sched_priority > rlim_rtprio) |
4800 | return -EPERM; | 4959 | return -EPERM; |
4801 | } | 4960 | } |
4961 | |||
4802 | /* | 4962 | /* |
4803 | * Like positive nice levels, dont allow tasks to | 4963 | * Treat SCHED_IDLE as nice 20. Only allow a switch to |
4804 | * move out of SCHED_IDLE either: | 4964 | * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. |
4805 | */ | 4965 | */ |
4806 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 4966 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { |
4807 | return -EPERM; | 4967 | if (!can_nice(p, TASK_NICE(p))) |
4968 | return -EPERM; | ||
4969 | } | ||
4808 | 4970 | ||
4809 | /* can't change other user's priorities */ | 4971 | /* can't change other user's priorities */ |
4810 | if (!check_same_owner(p)) | 4972 | if (!check_same_owner(p)) |
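Seen from userspace, the hunk above means an unprivileged task that earlier dropped itself to SCHED_IDLE can now return to SCHED_OTHER (the userspace name for SCHED_NORMAL), but only if its RLIMIT_NICE would permit its current nice value (or it has CAP_SYS_NICE); previously the switch always failed with EPERM. A minimal demonstration:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    struct sched_param sp = { .sched_priority = 0 };

    if (sched_setscheduler(0, SCHED_IDLE, &sp))
        perror("SCHED_IDLE");

    /* Succeeds on kernels with this change when RLIMIT_NICE is high enough
     * for the task's nice value; unconditionally EPERM on older kernels
     * for unprivileged tasks. */
    if (sched_setscheduler(0, SCHED_OTHER, &sp))
        fprintf(stderr, "back to SCHED_OTHER: %s\n", strerror(errno));
    else
        printf("back to SCHED_OTHER\n");
    return 0;
}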
@@ -4879,11 +5041,10 @@ recheck: | |||
4879 | 5041 | ||
4880 | if (running) | 5042 | if (running) |
4881 | p->sched_class->set_curr_task(rq); | 5043 | p->sched_class->set_curr_task(rq); |
4882 | if (on_rq) { | 5044 | if (on_rq) |
4883 | activate_task(rq, p, 0); | 5045 | activate_task(rq, p, 0); |
4884 | 5046 | ||
4885 | check_class_changed(rq, p, prev_class, oldprio, running); | 5047 | check_class_changed(rq, p, prev_class, oldprio); |
4886 | } | ||
4887 | __task_rq_unlock(rq); | 5048 | __task_rq_unlock(rq); |
4888 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5049 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4889 | 5050 | ||
@@ -5300,6 +5461,65 @@ void __sched yield(void) | |||
5300 | } | 5461 | } |
5301 | EXPORT_SYMBOL(yield); | 5462 | EXPORT_SYMBOL(yield); |
5302 | 5463 | ||
5464 | /** | ||
5465 | * yield_to - yield the current processor to another thread in | ||
5466 | * your thread group, or accelerate that thread toward the | ||
5467 | * processor it's on. | ||
5468 | * | ||
5469 | * It's the caller's job to ensure that the target task struct | ||
5470 | * can't go away on us before we can do any checks. | ||
5471 | * | ||
5472 | * Returns true if we indeed boosted the target task. | ||
5473 | */ | ||
5474 | bool __sched yield_to(struct task_struct *p, bool preempt) | ||
5475 | { | ||
5476 | struct task_struct *curr = current; | ||
5477 | struct rq *rq, *p_rq; | ||
5478 | unsigned long flags; | ||
5479 | bool yielded = 0; | ||
5480 | |||
5481 | local_irq_save(flags); | ||
5482 | rq = this_rq(); | ||
5483 | |||
5484 | again: | ||
5485 | p_rq = task_rq(p); | ||
5486 | double_rq_lock(rq, p_rq); | ||
5487 | while (task_rq(p) != p_rq) { | ||
5488 | double_rq_unlock(rq, p_rq); | ||
5489 | goto again; | ||
5490 | } | ||
5491 | |||
5492 | if (!curr->sched_class->yield_to_task) | ||
5493 | goto out; | ||
5494 | |||
5495 | if (curr->sched_class != p->sched_class) | ||
5496 | goto out; | ||
5497 | |||
5498 | if (task_running(p_rq, p) || p->state) | ||
5499 | goto out; | ||
5500 | |||
5501 | yielded = curr->sched_class->yield_to_task(rq, p, preempt); | ||
5502 | if (yielded) { | ||
5503 | schedstat_inc(rq, yld_count); | ||
5504 | /* | ||
5505 | * Make p's CPU reschedule; pick_next_entity takes care of | ||
5506 | * fairness. | ||
5507 | */ | ||
5508 | if (preempt && rq != p_rq) | ||
5509 | resched_task(p_rq->curr); | ||
5510 | } | ||
5511 | |||
5512 | out: | ||
5513 | double_rq_unlock(rq, p_rq); | ||
5514 | local_irq_restore(flags); | ||
5515 | |||
5516 | if (yielded) | ||
5517 | schedule(); | ||
5518 | |||
5519 | return yielded; | ||
5520 | } | ||
5521 | EXPORT_SYMBOL_GPL(yield_to); | ||
5522 | |||
5303 | /* | 5523 | /* |
5304 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5524 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
5305 | * that process accounting knows that this is a task in IO wait state. | 5525 | * that process accounting knows that this is a task in IO wait state. |
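yield_to() is exported GPL-only with directed-yield users such as virtualization code in mind (KVM gained a caller along these lines after this merge). A hedged kernel-side sketch of such a caller; boost_lock_holder() and the idea of an already-known 'owner' task are illustrative placeholders, not an API introduced by this patch:

#include <linux/sched.h>

/* Sketch only: the caller must guarantee 'owner' cannot go away underneath
 * us, e.g. by holding a reference, as the kerneldoc above requires. */
static bool boost_lock_holder(struct task_struct *owner)
{
    if (!owner || owner == current)
        return false;

    /* Ask the scheduler to run 'owner' next, preempting its CPU if needed;
     * returns true only if the target task was actually boosted. */
    return yield_to(owner, true);
}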
@@ -7773,6 +7993,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
7773 | INIT_LIST_HEAD(&cfs_rq->tasks); | 7993 | INIT_LIST_HEAD(&cfs_rq->tasks); |
7774 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7994 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7775 | cfs_rq->rq = rq; | 7995 | cfs_rq->rq = rq; |
7996 | /* allow initial update_cfs_load() to truncate */ | ||
7997 | #ifdef CONFIG_SMP | ||
7998 | cfs_rq->load_stamp = 1; | ||
7999 | #endif | ||
7776 | #endif | 8000 | #endif |
7777 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 8001 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
7778 | } | 8002 | } |
@@ -8086,6 +8310,8 @@ EXPORT_SYMBOL(__might_sleep); | |||
8086 | #ifdef CONFIG_MAGIC_SYSRQ | 8310 | #ifdef CONFIG_MAGIC_SYSRQ |
8087 | static void normalize_task(struct rq *rq, struct task_struct *p) | 8311 | static void normalize_task(struct rq *rq, struct task_struct *p) |
8088 | { | 8312 | { |
8313 | const struct sched_class *prev_class = p->sched_class; | ||
8314 | int old_prio = p->prio; | ||
8089 | int on_rq; | 8315 | int on_rq; |
8090 | 8316 | ||
8091 | on_rq = p->se.on_rq; | 8317 | on_rq = p->se.on_rq; |
@@ -8096,6 +8322,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p) | |||
8096 | activate_task(rq, p, 0); | 8322 | activate_task(rq, p, 0); |
8097 | resched_task(rq->curr); | 8323 | resched_task(rq->curr); |
8098 | } | 8324 | } |
8325 | |||
8326 | check_class_changed(rq, p, prev_class, old_prio); | ||
8099 | } | 8327 | } |
8100 | 8328 | ||
8101 | void normalize_rt_tasks(void) | 8329 | void normalize_rt_tasks(void) |
@@ -8487,7 +8715,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8487 | /* Propagate contribution to hierarchy */ | 8715 | /* Propagate contribution to hierarchy */ |
8488 | raw_spin_lock_irqsave(&rq->lock, flags); | 8716 | raw_spin_lock_irqsave(&rq->lock, flags); |
8489 | for_each_sched_entity(se) | 8717 | for_each_sched_entity(se) |
8490 | update_cfs_shares(group_cfs_rq(se), 0); | 8718 | update_cfs_shares(group_cfs_rq(se)); |
8491 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 8719 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
8492 | } | 8720 | } |
8493 | 8721 | ||
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb656283157..5946ac515602 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr; | |||
12 | static void __init autogroup_init(struct task_struct *init_task) | 12 | static void __init autogroup_init(struct task_struct *init_task) |
13 | { | 13 | { |
14 | autogroup_default.tg = &root_task_group; | 14 | autogroup_default.tg = &root_task_group; |
15 | root_task_group.autogroup = &autogroup_default; | ||
16 | kref_init(&autogroup_default.kref); | 15 | kref_init(&autogroup_default.kref); |
17 | init_rwsem(&autogroup_default.lock); | 16 | init_rwsem(&autogroup_default.lock); |
18 | init_task->signal->autogroup = &autogroup_default; | 17 | init_task->signal->autogroup = &autogroup_default; |
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) | |||
130 | 129 | ||
131 | static inline bool task_group_is_autogroup(struct task_group *tg) | 130 | static inline bool task_group_is_autogroup(struct task_group *tg) |
132 | { | 131 | { |
133 | return tg != &root_task_group && tg->autogroup; | 132 | return !!tg->autogroup; |
134 | } | 133 | } |
135 | 134 | ||
136 | static inline struct task_group * | 135 | static inline struct task_group * |
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
161 | 160 | ||
162 | p->signal->autogroup = autogroup_kref_get(ag); | 161 | p->signal->autogroup = autogroup_kref_get(ag); |
163 | 162 | ||
163 | if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) | ||
164 | goto out; | ||
165 | |||
164 | t = p; | 166 | t = p; |
165 | do { | 167 | do { |
166 | sched_move_task(t); | 168 | sched_move_task(t); |
167 | } while_each_thread(p, t); | 169 | } while_each_thread(p, t); |
168 | 170 | ||
171 | out: | ||
169 | unlock_task_sighand(p, &flags); | 172 | unlock_task_sighand(p, &flags); |
170 | autogroup_kref_put(prev); | 173 | autogroup_kref_put(prev); |
171 | } | 174 | } |
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | |||
247 | { | 250 | { |
248 | struct autogroup *ag = autogroup_task_get(p); | 251 | struct autogroup *ag = autogroup_task_get(p); |
249 | 252 | ||
253 | if (!task_group_is_autogroup(ag->tg)) | ||
254 | goto out; | ||
255 | |||
250 | down_read(&ag->lock); | 256 | down_read(&ag->lock); |
251 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); | 257 | seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); |
252 | up_read(&ag->lock); | 258 | up_read(&ag->lock); |
253 | 259 | ||
260 | out: | ||
254 | autogroup_kref_put(ag); | 261 | autogroup_kref_put(ag); |
255 | } | 262 | } |
256 | #endif /* CONFIG_PROC_FS */ | 263 | #endif /* CONFIG_PROC_FS */ |
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) | |||
258 | #ifdef CONFIG_SCHED_DEBUG | 265 | #ifdef CONFIG_SCHED_DEBUG |
259 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 266 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
260 | { | 267 | { |
261 | int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); | 268 | if (!task_group_is_autogroup(tg)) |
262 | |||
263 | if (!enabled || !tg->autogroup) | ||
264 | return 0; | 269 | return 0; |
265 | 270 | ||
266 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 271 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
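The /proc interface touched above is easy to inspect from userspace: /proc/<pid>/autogroup prints a line of the form "/autogroup-<id> nice <n>", and with the proc_sched_autogroup_show_task() change the read yields nothing when the task's group is not an autogroup. A minimal reader (assumes a kernel built with CONFIG_SCHED_AUTOGROUP):

#include <stdio.h>

int main(void)
{
    char line[128];
    FILE *f = fopen("/proc/self/autogroup", "r");

    if (!f) {
        perror("/proc/self/autogroup");   /* likely CONFIG_SCHED_AUTOGROUP=n */
        return 1;
    }
    if (fgets(line, sizeof(line), f))
        fputs(line, stdout);              /* e.g. "/autogroup-42 nice 0" */
    else
        puts("not in an autogroup");
    fclose(f);
    return 0;
}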
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5dad..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@ | |||
1 | #ifdef CONFIG_SCHED_AUTOGROUP | 1 | #ifdef CONFIG_SCHED_AUTOGROUP |
2 | 2 | ||
3 | struct autogroup { | 3 | struct autogroup { |
4 | /* | ||
5 | * reference doesn't mean how many thread attach to this | ||
6 | * autogroup now. It just stands for the number of task | ||
7 | * could use this autogroup. | ||
8 | */ | ||
4 | struct kref kref; | 9 | struct kref kref; |
5 | struct task_group *tg; | 10 | struct task_group *tg; |
6 | struct rw_semaphore lock; | 11 | struct rw_semaphore lock; |
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..7bacd83a4158 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
179 | 179 | ||
180 | raw_spin_lock_irqsave(&rq->lock, flags); | 180 | raw_spin_lock_irqsave(&rq->lock, flags); |
181 | if (cfs_rq->rb_leftmost) | 181 | if (cfs_rq->rb_leftmost) |
182 | MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; | 182 | MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; |
183 | last = __pick_last_entity(cfs_rq); | 183 | last = __pick_last_entity(cfs_rq); |
184 | if (last) | 184 | if (last) |
185 | max_vruntime = last->vruntime; | 185 | max_vruntime = last->vruntime; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..3f7ec9e27ee1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -69,14 +69,6 @@ static unsigned int sched_nr_latency = 8; | |||
69 | unsigned int sysctl_sched_child_runs_first __read_mostly; | 69 | unsigned int sysctl_sched_child_runs_first __read_mostly; |
70 | 70 | ||
71 | /* | 71 | /* |
72 | * sys_sched_yield() compat mode | ||
73 | * | ||
74 | * This option switches the agressive yield implementation of the | ||
75 | * old scheduler back on. | ||
76 | */ | ||
77 | unsigned int __read_mostly sysctl_sched_compat_yield; | ||
78 | |||
79 | /* | ||
80 | * SCHED_OTHER wake-up granularity. | 72 | * SCHED_OTHER wake-up granularity. |
81 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | 73 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) |
82 | * | 74 | * |
@@ -419,7 +411,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
419 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 411 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
420 | } | 412 | } |
421 | 413 | ||
422 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | 414 | static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) |
423 | { | 415 | { |
424 | struct rb_node *left = cfs_rq->rb_leftmost; | 416 | struct rb_node *left = cfs_rq->rb_leftmost; |
425 | 417 | ||
@@ -429,6 +421,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | |||
429 | return rb_entry(left, struct sched_entity, run_node); | 421 | return rb_entry(left, struct sched_entity, run_node); |
430 | } | 422 | } |
431 | 423 | ||
424 | static struct sched_entity *__pick_next_entity(struct sched_entity *se) | ||
425 | { | ||
426 | struct rb_node *next = rb_next(&se->run_node); | ||
427 | |||
428 | if (!next) | ||
429 | return NULL; | ||
430 | |||
431 | return rb_entry(next, struct sched_entity, run_node); | ||
432 | } | ||
433 | |||
434 | #ifdef CONFIG_SCHED_DEBUG | ||
432 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | 435 | static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) |
433 | { | 436 | { |
434 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); | 437 | struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); |
@@ -443,7 +446,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
443 | * Scheduling class statistics methods: | 446 | * Scheduling class statistics methods: |
444 | */ | 447 | */ |
445 | 448 | ||
446 | #ifdef CONFIG_SCHED_DEBUG | ||
447 | int sched_proc_update_handler(struct ctl_table *table, int write, | 449 | int sched_proc_update_handler(struct ctl_table *table, int write, |
448 | void __user *buffer, size_t *lenp, | 450 | void __user *buffer, size_t *lenp, |
449 | loff_t *ppos) | 451 | loff_t *ppos) |
@@ -540,7 +542,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
540 | } | 542 | } |
541 | 543 | ||
542 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); | 544 | static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); |
543 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); | 545 | static void update_cfs_shares(struct cfs_rq *cfs_rq); |
544 | 546 | ||
545 | /* | 547 | /* |
546 | * Update the current task's runtime statistics. Skip current tasks that | 548 | * Update the current task's runtime statistics. Skip current tasks that |
@@ -733,6 +735,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
733 | now - cfs_rq->load_last > 4 * period) { | 735 | now - cfs_rq->load_last > 4 * period) { |
734 | cfs_rq->load_period = 0; | 736 | cfs_rq->load_period = 0; |
735 | cfs_rq->load_avg = 0; | 737 | cfs_rq->load_avg = 0; |
738 | delta = period - 1; | ||
736 | } | 739 | } |
737 | 740 | ||
738 | cfs_rq->load_stamp = now; | 741 | cfs_rq->load_stamp = now; |
@@ -763,16 +766,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
763 | list_del_leaf_cfs_rq(cfs_rq); | 766 | list_del_leaf_cfs_rq(cfs_rq); |
764 | } | 767 | } |
765 | 768 | ||
766 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | 769 | static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
767 | long weight_delta) | ||
768 | { | 770 | { |
769 | long load_weight, load, shares; | 771 | long load_weight, load, shares; |
770 | 772 | ||
771 | load = cfs_rq->load.weight + weight_delta; | 773 | load = cfs_rq->load.weight; |
772 | 774 | ||
773 | load_weight = atomic_read(&tg->load_weight); | 775 | load_weight = atomic_read(&tg->load_weight); |
774 | load_weight -= cfs_rq->load_contribution; | ||
775 | load_weight += load; | 776 | load_weight += load; |
777 | load_weight -= cfs_rq->load_contribution; | ||
776 | 778 | ||
777 | shares = (tg->shares * load); | 779 | shares = (tg->shares * load); |
778 | if (load_weight) | 780 | if (load_weight) |
@@ -790,7 +792,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq) | |||
790 | { | 792 | { |
791 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { | 793 | if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { |
792 | update_cfs_load(cfs_rq, 0); | 794 | update_cfs_load(cfs_rq, 0); |
793 | update_cfs_shares(cfs_rq, 0); | 795 | update_cfs_shares(cfs_rq); |
794 | } | 796 | } |
795 | } | 797 | } |
796 | # else /* CONFIG_SMP */ | 798 | # else /* CONFIG_SMP */ |
@@ -798,8 +800,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
798 | { | 800 | { |
799 | } | 801 | } |
800 | 802 | ||
801 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, | 803 | static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) |
802 | long weight_delta) | ||
803 | { | 804 | { |
804 | return tg->shares; | 805 | return tg->shares; |
805 | } | 806 | } |
@@ -824,7 +825,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | |||
824 | account_entity_enqueue(cfs_rq, se); | 825 | account_entity_enqueue(cfs_rq, se); |
825 | } | 826 | } |
826 | 827 | ||
827 | static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | 828 | static void update_cfs_shares(struct cfs_rq *cfs_rq) |
828 | { | 829 | { |
829 | struct task_group *tg; | 830 | struct task_group *tg; |
830 | struct sched_entity *se; | 831 | struct sched_entity *se; |
@@ -838,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | |||
838 | if (likely(se->load.weight == tg->shares)) | 839 | if (likely(se->load.weight == tg->shares)) |
839 | return; | 840 | return; |
840 | #endif | 841 | #endif |
841 | shares = calc_cfs_shares(cfs_rq, tg, weight_delta); | 842 | shares = calc_cfs_shares(cfs_rq, tg); |
842 | 843 | ||
843 | reweight_entity(cfs_rq_of(se), se, shares); | 844 | reweight_entity(cfs_rq_of(se), se, shares); |
844 | } | 845 | } |
@@ -847,7 +848,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) | |||
847 | { | 848 | { |
848 | } | 849 | } |
849 | 850 | ||
850 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) | 851 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) |
851 | { | 852 | { |
852 | } | 853 | } |
853 | 854 | ||
@@ -978,8 +979,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
978 | */ | 979 | */ |
979 | update_curr(cfs_rq); | 980 | update_curr(cfs_rq); |
980 | update_cfs_load(cfs_rq, 0); | 981 | update_cfs_load(cfs_rq, 0); |
981 | update_cfs_shares(cfs_rq, se->load.weight); | ||
982 | account_entity_enqueue(cfs_rq, se); | 982 | account_entity_enqueue(cfs_rq, se); |
983 | update_cfs_shares(cfs_rq); | ||
983 | 984 | ||
984 | if (flags & ENQUEUE_WAKEUP) { | 985 | if (flags & ENQUEUE_WAKEUP) { |
985 | place_entity(cfs_rq, se, 0); | 986 | place_entity(cfs_rq, se, 0); |
@@ -996,19 +997,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
996 | list_add_leaf_cfs_rq(cfs_rq); | 997 | list_add_leaf_cfs_rq(cfs_rq); |
997 | } | 998 | } |
998 | 999 | ||
999 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1000 | static void __clear_buddies_last(struct sched_entity *se) |
1001 | { | ||
1002 | for_each_sched_entity(se) { | ||
1003 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1004 | if (cfs_rq->last == se) | ||
1005 | cfs_rq->last = NULL; | ||
1006 | else | ||
1007 | break; | ||
1008 | } | ||
1009 | } | ||
1010 | |||
1011 | static void __clear_buddies_next(struct sched_entity *se) | ||
1000 | { | 1012 | { |
1001 | if (!se || cfs_rq->last == se) | 1013 | for_each_sched_entity(se) { |
1002 | cfs_rq->last = NULL; | 1014 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1015 | if (cfs_rq->next == se) | ||
1016 | cfs_rq->next = NULL; | ||
1017 | else | ||
1018 | break; | ||
1019 | } | ||
1020 | } | ||
1003 | 1021 | ||
1004 | if (!se || cfs_rq->next == se) | 1022 | static void __clear_buddies_skip(struct sched_entity *se) |
1005 | cfs_rq->next = NULL; | 1023 | { |
1024 | for_each_sched_entity(se) { | ||
1025 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1026 | if (cfs_rq->skip == se) | ||
1027 | cfs_rq->skip = NULL; | ||
1028 | else | ||
1029 | break; | ||
1030 | } | ||
1006 | } | 1031 | } |
1007 | 1032 | ||
1008 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1033 | static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
1009 | { | 1034 | { |
1010 | for_each_sched_entity(se) | 1035 | if (cfs_rq->last == se) |
1011 | __clear_buddies(cfs_rq_of(se), se); | 1036 | __clear_buddies_last(se); |
1037 | |||
1038 | if (cfs_rq->next == se) | ||
1039 | __clear_buddies_next(se); | ||
1040 | |||
1041 | if (cfs_rq->skip == se) | ||
1042 | __clear_buddies_skip(se); | ||
1012 | } | 1043 | } |
1013 | 1044 | ||
1014 | static void | 1045 | static void |
@@ -1041,7 +1072,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
1041 | update_cfs_load(cfs_rq, 0); | 1072 | update_cfs_load(cfs_rq, 0); |
1042 | account_entity_dequeue(cfs_rq, se); | 1073 | account_entity_dequeue(cfs_rq, se); |
1043 | update_min_vruntime(cfs_rq); | 1074 | update_min_vruntime(cfs_rq); |
1044 | update_cfs_shares(cfs_rq, 0); | 1075 | update_cfs_shares(cfs_rq); |
1045 | 1076 | ||
1046 | /* | 1077 | /* |
1047 | * Normalize the entity after updating the min_vruntime because the | 1078 | * Normalize the entity after updating the min_vruntime because the |
@@ -1084,7 +1115,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
1084 | return; | 1115 | return; |
1085 | 1116 | ||
1086 | if (cfs_rq->nr_running > 1) { | 1117 | if (cfs_rq->nr_running > 1) { |
1087 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1118 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
1088 | s64 delta = curr->vruntime - se->vruntime; | 1119 | s64 delta = curr->vruntime - se->vruntime; |
1089 | 1120 | ||
1090 | if (delta < 0) | 1121 | if (delta < 0) |
@@ -1128,13 +1159,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
1128 | static int | 1159 | static int |
1129 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | 1160 | wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); |
1130 | 1161 | ||
1162 | /* | ||
1163 | * Pick the next process, keeping these things in mind, in this order: | ||
1164 | * 1) keep things fair between processes/task groups | ||
1165 | * 2) pick the "next" process, since someone really wants that to run | ||
1166 | * 3) pick the "last" process, for cache locality | ||
1167 | * 4) do not run the "skip" process, if something else is available | ||
1168 | */ | ||
1131 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 1169 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
1132 | { | 1170 | { |
1133 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 1171 | struct sched_entity *se = __pick_first_entity(cfs_rq); |
1134 | struct sched_entity *left = se; | 1172 | struct sched_entity *left = se; |
1135 | 1173 | ||
1136 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | 1174 | /* |
1137 | se = cfs_rq->next; | 1175 | * Avoid running the skip buddy, if running something else can |
1176 | * be done without getting too unfair. | ||
1177 | */ | ||
1178 | if (cfs_rq->skip == se) { | ||
1179 | struct sched_entity *second = __pick_next_entity(se); | ||
1180 | if (second && wakeup_preempt_entity(second, left) < 1) | ||
1181 | se = second; | ||
1182 | } | ||
1138 | 1183 | ||
1139 | /* | 1184 | /* |
1140 | * Prefer last buddy, try to return the CPU to a preempted task. | 1185 | * Prefer last buddy, try to return the CPU to a preempted task. |
@@ -1142,6 +1187,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | |||
1142 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | 1187 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) |
1143 | se = cfs_rq->last; | 1188 | se = cfs_rq->last; |
1144 | 1189 | ||
1190 | /* | ||
1191 | * Someone really wants this to run. If it's not unfair, run it. | ||
1192 | */ | ||
1193 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) | ||
1194 | se = cfs_rq->next; | ||
1195 | |||
1145 | clear_buddies(cfs_rq, se); | 1196 | clear_buddies(cfs_rq, se); |
1146 | 1197 | ||
1147 | return se; | 1198 | return se; |
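The selection order documented in the comment block above (keep things fair, honour the next and last buddies, never run the skip buddy if something comparable is available) can be modelled without any scheduler internals. In this standalone sketch the entities, their vruntimes and the reduction of wakeup_preempt_entity() to a fixed vruntime margin are all illustrative:

#include <stddef.h>
#include <stdio.h>

struct entity { const char *name; long vruntime; };

/* crude stand-in for "wakeup_preempt_entity(cand, left) >= 1" */
static int too_unfair(const struct entity *cand, const struct entity *left)
{
    return cand->vruntime > left->vruntime + 1000;
}

static const struct entity *pick(const struct entity *first, const struct entity *second,
                                 const struct entity *skip, const struct entity *last,
                                 const struct entity *next)
{
    const struct entity *left = first, *se = first;

    if (skip == se && second && !too_unfair(second, left))
        se = second;                    /* step past the skip buddy */
    if (last && !too_unfair(last, left))
        se = last;                      /* prefer cache-hot preempted task */
    if (next && !too_unfair(next, left))
        se = next;                      /* someone really wants this to run */
    return se;
}

int main(void)
{
    struct entity a = { "A", 100 }, b = { "B", 600 }, c = { "C", 5000 };

    /* A just yielded (skip buddy); B is close enough in vruntime to run instead. */
    printf("picked %s\n", pick(&a, &b, &a, NULL, NULL)->name);
    /* C was set as next buddy but is far behind in fairness: A still runs. */
    printf("picked %s\n", pick(&a, &b, NULL, NULL, &c)->name);
    return 0;
}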
@@ -1282,7 +1333,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1282 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1333 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1283 | 1334 | ||
1284 | update_cfs_load(cfs_rq, 0); | 1335 | update_cfs_load(cfs_rq, 0); |
1285 | update_cfs_shares(cfs_rq, 0); | 1336 | update_cfs_shares(cfs_rq); |
1286 | } | 1337 | } |
1287 | 1338 | ||
1288 | hrtick_update(rq); | 1339 | hrtick_update(rq); |
@@ -1312,58 +1363,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1312 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 1363 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
1313 | 1364 | ||
1314 | update_cfs_load(cfs_rq, 0); | 1365 | update_cfs_load(cfs_rq, 0); |
1315 | update_cfs_shares(cfs_rq, 0); | 1366 | update_cfs_shares(cfs_rq); |
1316 | } | 1367 | } |
1317 | 1368 | ||
1318 | hrtick_update(rq); | 1369 | hrtick_update(rq); |
1319 | } | 1370 | } |
1320 | 1371 | ||
1321 | /* | ||
1322 | * sched_yield() support is very simple - we dequeue and enqueue. | ||
1323 | * | ||
1324 | * If compat_yield is turned on then we requeue to the end of the tree. | ||
1325 | */ | ||
1326 | static void yield_task_fair(struct rq *rq) | ||
1327 | { | ||
1328 | struct task_struct *curr = rq->curr; | ||
1329 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
1330 | struct sched_entity *rightmost, *se = &curr->se; | ||
1331 | |||
1332 | /* | ||
1333 | * Are we the only task in the tree? | ||
1334 | */ | ||
1335 | if (unlikely(cfs_rq->nr_running == 1)) | ||
1336 | return; | ||
1337 | |||
1338 | clear_buddies(cfs_rq, se); | ||
1339 | |||
1340 | if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { | ||
1341 | update_rq_clock(rq); | ||
1342 | /* | ||
1343 | * Update run-time statistics of the 'current'. | ||
1344 | */ | ||
1345 | update_curr(cfs_rq); | ||
1346 | |||
1347 | return; | ||
1348 | } | ||
1349 | /* | ||
1350 | * Find the rightmost entry in the rbtree: | ||
1351 | */ | ||
1352 | rightmost = __pick_last_entity(cfs_rq); | ||
1353 | /* | ||
1354 | * Already in the rightmost position? | ||
1355 | */ | ||
1356 | if (unlikely(!rightmost || entity_before(rightmost, se))) | ||
1357 | return; | ||
1358 | |||
1359 | /* | ||
1360 | * Minimally necessary key value to be last in the tree: | ||
1361 | * Upon rescheduling, sched_class::put_prev_task() will place | ||
1362 | * 'current' within the tree based on its new key value. | ||
1363 | */ | ||
1364 | se->vruntime = rightmost->vruntime + 1; | ||
1365 | } | ||
1366 | |||
1367 | #ifdef CONFIG_SMP | 1372 | #ifdef CONFIG_SMP |
1368 | 1373 | ||
1369 | static void task_waking_fair(struct rq *rq, struct task_struct *p) | 1374 | static void task_waking_fair(struct rq *rq, struct task_struct *p) |
@@ -1834,6 +1839,14 @@ static void set_next_buddy(struct sched_entity *se) | |||
1834 | } | 1839 | } |
1835 | } | 1840 | } |
1836 | 1841 | ||
1842 | static void set_skip_buddy(struct sched_entity *se) | ||
1843 | { | ||
1844 | if (likely(task_of(se)->policy != SCHED_IDLE)) { | ||
1845 | for_each_sched_entity(se) | ||
1846 | cfs_rq_of(se)->skip = se; | ||
1847 | } | ||
1848 | } | ||
1849 | |||
1837 | /* | 1850 | /* |
1838 | * Preempt the current task with a newly woken task if needed: | 1851 | * Preempt the current task with a newly woken task if needed: |
1839 | */ | 1852 | */ |
@@ -1857,16 +1870,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1857 | if (test_tsk_need_resched(curr)) | 1870 | if (test_tsk_need_resched(curr)) |
1858 | return; | 1871 | return; |
1859 | 1872 | ||
1873 | /* Idle tasks are by definition preempted by non-idle tasks. */ | ||
1874 | if (unlikely(curr->policy == SCHED_IDLE) && | ||
1875 | likely(p->policy != SCHED_IDLE)) | ||
1876 | goto preempt; | ||
1877 | |||
1860 | /* | 1878 | /* |
1861 | * Batch and idle tasks do not preempt (their preemption is driven by | 1879 | * Batch and idle tasks do not preempt non-idle tasks (their preemption |
1862 | * the tick): | 1880 | * is driven by the tick): |
1863 | */ | 1881 | */ |
1864 | if (unlikely(p->policy != SCHED_NORMAL)) | 1882 | if (unlikely(p->policy != SCHED_NORMAL)) |
1865 | return; | 1883 | return; |
1866 | 1884 | ||
1867 | /* Idle tasks are by definition preempted by everybody. */ | ||
1868 | if (unlikely(curr->policy == SCHED_IDLE)) | ||
1869 | goto preempt; | ||
1870 | 1885 | ||
1871 | if (!sched_feat(WAKEUP_PREEMPT)) | 1886 | if (!sched_feat(WAKEUP_PREEMPT)) |
1872 | return; | 1887 | return; |
@@ -1932,6 +1947,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) | |||
1932 | } | 1947 | } |
1933 | } | 1948 | } |
1934 | 1949 | ||
1950 | /* | ||
1951 | * sched_yield() is very simple | ||
1952 | * | ||
1953 | * The magic of dealing with the ->skip buddy is in pick_next_entity. | ||
1954 | */ | ||
1955 | static void yield_task_fair(struct rq *rq) | ||
1956 | { | ||
1957 | struct task_struct *curr = rq->curr; | ||
1958 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | ||
1959 | struct sched_entity *se = &curr->se; | ||
1960 | |||
1961 | /* | ||
1962 | * Are we the only task in the tree? | ||
1963 | */ | ||
1964 | if (unlikely(rq->nr_running == 1)) | ||
1965 | return; | ||
1966 | |||
1967 | clear_buddies(cfs_rq, se); | ||
1968 | |||
1969 | if (curr->policy != SCHED_BATCH) { | ||
1970 | update_rq_clock(rq); | ||
1971 | /* | ||
1972 | * Update run-time statistics of the 'current'. | ||
1973 | */ | ||
1974 | update_curr(cfs_rq); | ||
1975 | } | ||
1976 | |||
1977 | set_skip_buddy(se); | ||
1978 | } | ||
1979 | |||
1980 | static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) | ||
1981 | { | ||
1982 | struct sched_entity *se = &p->se; | ||
1983 | |||
1984 | if (!se->on_rq) | ||
1985 | return false; | ||
1986 | |||
1987 | /* Tell the scheduler that we'd really like pse to run next. */ | ||
1988 | set_next_buddy(se); | ||
1989 | |||
1990 | yield_task_fair(rq); | ||
1991 | |||
1992 | return true; | ||
1993 | } | ||
1994 | |||
1935 | #ifdef CONFIG_SMP | 1995 | #ifdef CONFIG_SMP |
1936 | /************************************************** | 1996 | /************************************************** |
1937 | * Fair scheduling class load-balancing methods: | 1997 | * Fair scheduling class load-balancing methods: |
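From userspace nothing changes syntactically: sched_yield() still just yields. Under the implementation above, though, the caller is marked as the skip buddy, and the removed sysctl_sched_compat_yield mode that requeued the task behind the rightmost entity is gone. Minimal call site for reference:

#include <sched.h>

int main(void)
{
    for (int i = 0; i < 1000; i++)
        sched_yield();          /* each call sets the skip buddy on this task */
    return 0;
}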
@@ -2123,7 +2183,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu) | |||
2123 | * We need to update shares after updating tg->load_weight in | 2183 | * We need to update shares after updating tg->load_weight in |
2124 | * order to adjust the weight of groups with long running tasks. | 2184 | * order to adjust the weight of groups with long running tasks. |
2125 | */ | 2185 | */ |
2126 | update_cfs_shares(cfs_rq, 0); | 2186 | update_cfs_shares(cfs_rq); |
2127 | 2187 | ||
2128 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 2188 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
2129 | 2189 | ||
@@ -2610,7 +2670,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2610 | * @this_cpu: Cpu for which load balance is currently performed. | 2670 | * @this_cpu: Cpu for which load balance is currently performed. |
2611 | * @idle: Idle status of this_cpu | 2671 | * @idle: Idle status of this_cpu |
2612 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 2672 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
2613 | * @sd_idle: Idle status of the sched_domain containing group. | ||
2614 | * @local_group: Does group contain this_cpu. | 2673 | * @local_group: Does group contain this_cpu. |
2615 | * @cpus: Set of cpus considered for load balancing. | 2674 | * @cpus: Set of cpus considered for load balancing. |
2616 | * @balance: Should we balance. | 2675 | * @balance: Should we balance. |
@@ -2618,7 +2677,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
2618 | */ | 2677 | */ |
2619 | static inline void update_sg_lb_stats(struct sched_domain *sd, | 2678 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
2620 | struct sched_group *group, int this_cpu, | 2679 | struct sched_group *group, int this_cpu, |
2621 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 2680 | enum cpu_idle_type idle, int load_idx, |
2622 | int local_group, const struct cpumask *cpus, | 2681 | int local_group, const struct cpumask *cpus, |
2623 | int *balance, struct sg_lb_stats *sgs) | 2682 | int *balance, struct sg_lb_stats *sgs) |
2624 | { | 2683 | { |
@@ -2638,9 +2697,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2638 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { | 2697 | for_each_cpu_and(i, sched_group_cpus(group), cpus) { |
2639 | struct rq *rq = cpu_rq(i); | 2698 | struct rq *rq = cpu_rq(i); |
2640 | 2699 | ||
2641 | if (*sd_idle && rq->nr_running) | ||
2642 | *sd_idle = 0; | ||
2643 | |||
2644 | /* Bias balancing toward cpus of our domain */ | 2700 | /* Bias balancing toward cpus of our domain */ |
2645 | if (local_group) { | 2701 | if (local_group) { |
2646 | if (idle_cpu(i) && !first_idle_cpu) { | 2702 | if (idle_cpu(i) && !first_idle_cpu) { |
@@ -2685,7 +2741,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2685 | 2741 | ||
2686 | /* | 2742 | /* |
2687 | * Consider the group unbalanced when the imbalance is larger | 2743 | * Consider the group unbalanced when the imbalance is larger |
2688 | * than the average weight of two tasks. | 2744 | * than the average weight of a task. |
2689 | * | 2745 | * |
2690 | * APZ: with cgroup the avg task weight can vary wildly and | 2746 | * APZ: with cgroup the avg task weight can vary wildly and |
2691 | * might not be a suitable number - should we keep a | 2747 | * might not be a suitable number - should we keep a |
@@ -2695,7 +2751,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, | |||
2695 | if (sgs->sum_nr_running) | 2751 | if (sgs->sum_nr_running) |
2696 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 2752 | avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; |
2697 | 2753 | ||
2698 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) | 2754 | if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) |
2699 | sgs->group_imb = 1; | 2755 | sgs->group_imb = 1; |
2700 | 2756 | ||
2701 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | 2757 | sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); |
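
The relaxed test above flags a group as imbalanced once the spread between its most and least loaded CPU reaches one average task weight rather than two. A small arithmetic sketch with made-up load figures (not kernel code) showing a case the old test missed:

#include <stdio.h>

int main(void)
{
        unsigned long max_cpu_load = 3072, min_cpu_load = 1024;
        unsigned long sum_weighted_load = 4096, sum_nr_running = 4;
        unsigned long avg_load_per_task = sum_weighted_load / sum_nr_running;

        int old_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;
        int new_imb = (max_cpu_load - min_cpu_load) >= avg_load_per_task;

        printf("avg/task=%lu spread=%lu old_group_imb=%d new_group_imb=%d\n",
               avg_load_per_task, max_cpu_load - min_cpu_load,
               old_imb, new_imb);
        return 0;
}
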
@@ -2755,15 +2811,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd, | |||
2755 | * @sd: sched_domain whose statistics are to be updated. | 2811 | * @sd: sched_domain whose statistics are to be updated. |
2756 | * @this_cpu: Cpu for which load balance is currently performed. | 2812 | * @this_cpu: Cpu for which load balance is currently performed. |
2757 | * @idle: Idle status of this_cpu | 2813 | * @idle: Idle status of this_cpu |
2758 | * @sd_idle: Idle status of the sched_domain containing sg. | ||
2759 | * @cpus: Set of cpus considered for load balancing. | 2814 | * @cpus: Set of cpus considered for load balancing. |
2760 | * @balance: Should we balance. | 2815 | * @balance: Should we balance. |
2761 | * @sds: variable to hold the statistics for this sched_domain. | 2816 | * @sds: variable to hold the statistics for this sched_domain. |
2762 | */ | 2817 | */ |
2763 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | 2818 | static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, |
2764 | enum cpu_idle_type idle, int *sd_idle, | 2819 | enum cpu_idle_type idle, const struct cpumask *cpus, |
2765 | const struct cpumask *cpus, int *balance, | 2820 | int *balance, struct sd_lb_stats *sds) |
2766 | struct sd_lb_stats *sds) | ||
2767 | { | 2821 | { |
2768 | struct sched_domain *child = sd->child; | 2822 | struct sched_domain *child = sd->child; |
2769 | struct sched_group *sg = sd->groups; | 2823 | struct sched_group *sg = sd->groups; |
@@ -2781,7 +2835,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
2781 | 2835 | ||
2782 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); | 2836 | local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); |
2783 | memset(&sgs, 0, sizeof(sgs)); | 2837 | memset(&sgs, 0, sizeof(sgs)); |
2784 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, | 2838 | update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, |
2785 | local_group, cpus, balance, &sgs); | 2839 | local_group, cpus, balance, &sgs); |
2786 | 2840 | ||
2787 | if (local_group && !(*balance)) | 2841 | if (local_group && !(*balance)) |
@@ -3033,7 +3087,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3033 | * @imbalance: Variable which stores amount of weighted load which should | 3087 | * @imbalance: Variable which stores amount of weighted load which should |
3034 | * be moved to restore balance/put a group to idle. | 3088 | * be moved to restore balance/put a group to idle. |
3035 | * @idle: The idle status of this_cpu. | 3089 | * @idle: The idle status of this_cpu. |
3036 | * @sd_idle: The idleness of sd | ||
3037 | * @cpus: The set of CPUs under consideration for load-balancing. | 3090 | * @cpus: The set of CPUs under consideration for load-balancing. |
3038 | * @balance: Pointer to a variable indicating if this_cpu | 3091 | * @balance: Pointer to a variable indicating if this_cpu |
3039 | * is the appropriate cpu to perform load balancing at this_level. | 3092 | * is the appropriate cpu to perform load balancing at this_level. |
@@ -3046,7 +3099,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3046 | static struct sched_group * | 3099 | static struct sched_group * |
3047 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 3100 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
3048 | unsigned long *imbalance, enum cpu_idle_type idle, | 3101 | unsigned long *imbalance, enum cpu_idle_type idle, |
3049 | int *sd_idle, const struct cpumask *cpus, int *balance) | 3102 | const struct cpumask *cpus, int *balance) |
3050 | { | 3103 | { |
3051 | struct sd_lb_stats sds; | 3104 | struct sd_lb_stats sds; |
3052 | 3105 | ||
@@ -3056,22 +3109,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3056 | * Compute the various statistics relevant for load balancing at | 3109 | * Compute the various statistics relevant for load balancing at |
3057 | * this level. | 3110 | * this level. |
3058 | */ | 3111 | */ |
3059 | update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, | 3112 | update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); |
3060 | balance, &sds); | 3113 | |
3061 | 3114 | /* | |
3062 | /* Cases where imbalance does not exist from POV of this_cpu */ | 3115 | * this_cpu is not the appropriate cpu to perform load balancing at |
3063 | /* 1) this_cpu is not the appropriate cpu to perform load balancing | 3116 | * this level. |
3064 | * at this level. | ||
3065 | * 2) There is no busy sibling group to pull from. | ||
3066 | * 3) This group is the busiest group. | ||
3067 | * 4) This group is more busy than the avg busieness at this | ||
3068 | * sched_domain. | ||
3069 | * 5) The imbalance is within the specified limit. | ||
3070 | * | ||
3071 | * Note: when doing newidle balance, if the local group has excess | ||
3072 | * capacity (i.e. nr_running < group_capacity) and the busiest group | ||
3073 | * does not have any capacity, we force a load balance to pull tasks | ||
3074 | * to the local group. In this case, we skip past checks 3, 4 and 5. | ||
3075 | */ | 3117 | */ |
3076 | if (!(*balance)) | 3118 | if (!(*balance)) |
3077 | goto ret; | 3119 | goto ret; |
@@ -3080,41 +3122,55 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
3080 | check_asym_packing(sd, &sds, this_cpu, imbalance)) | 3122 | check_asym_packing(sd, &sds, this_cpu, imbalance)) |
3081 | return sds.busiest; | 3123 | return sds.busiest; |
3082 | 3124 | ||
3125 | /* There is no busy sibling group to pull tasks from */ | ||
3083 | if (!sds.busiest || sds.busiest_nr_running == 0) | 3126 | if (!sds.busiest || sds.busiest_nr_running == 0) |
3084 | goto out_balanced; | 3127 | goto out_balanced; |
3085 | 3128 | ||
3086 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | 3129 | /* |
3130 | * If the busiest group is imbalanced the below checks don't | ||
3131 | * work because they assume all things are equal, which typically | ||
3132 | * isn't true due to cpus_allowed constraints and the like. | ||
3133 | */ | ||
3134 | if (sds.group_imb) | ||
3135 | goto force_balance; | ||
3136 | |||
3137 | /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ | ||
3087 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && | 3138 | if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && |
3088 | !sds.busiest_has_capacity) | 3139 | !sds.busiest_has_capacity) |
3089 | goto force_balance; | 3140 | goto force_balance; |
3090 | 3141 | ||
3142 | /* | ||
3143 | * If the local group is more busy than the selected busiest group | ||
3144 | * don't try and pull any tasks. | ||
3145 | */ | ||
3091 | if (sds.this_load >= sds.max_load) | 3146 | if (sds.this_load >= sds.max_load) |
3092 | goto out_balanced; | 3147 | goto out_balanced; |
3093 | 3148 | ||
3149 | /* | ||
3150 | * Don't pull any tasks if this group is already above the domain | ||
3151 | * average load. | ||
3152 | */ | ||
3094 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; | 3153 | sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; |
3095 | |||
3096 | if (sds.this_load >= sds.avg_load) | 3154 | if (sds.this_load >= sds.avg_load) |
3097 | goto out_balanced; | 3155 | goto out_balanced; |
3098 | 3156 | ||
3099 | /* | 3157 | if (idle == CPU_IDLE) { |
3100 | * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative. | ||
3101 | * And to check for busy balance use !idle_cpu instead of | ||
3102 | * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE | ||
3103 | * even when they are idle. | ||
3104 | */ | ||
3105 | if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) { | ||
3106 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
3107 | goto out_balanced; | ||
3108 | } else { | ||
3109 | /* | 3158 | /* |
3110 | * This cpu is idle. If the busiest group load doesn't | 3159 | * This cpu is idle. If the busiest group load doesn't |
3111 | * have more tasks than the number of available cpu's and | 3160 | * have more tasks than the number of available cpu's and |
3112 | * there is no imbalance between this and busiest group | 3161 | * there is no imbalance between this and busiest group |
3113 | * wrt to idle cpu's, it is balanced. | 3162 | * wrt to idle cpu's, it is balanced. |
3114 | */ | 3163 | */ |
3115 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && | 3164 | if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && |
3116 | sds.busiest_nr_running <= sds.busiest_group_weight) | 3165 | sds.busiest_nr_running <= sds.busiest_group_weight) |
3117 | goto out_balanced; | 3166 | goto out_balanced; |
3167 | } else { | ||
3168 | /* | ||
3169 | * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use | ||
3170 | * imbalance_pct to be conservative. | ||
3171 | */ | ||
3172 | if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) | ||
3173 | goto out_balanced; | ||
3118 | } | 3174 | } |
3119 | 3175 | ||
3120 | force_balance: | 3176 | force_balance: |
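
Putting the reordered checks together, the sketch below is a compressed userspace model of the decision chain as it now stands. The field names mirror sd_lb_stats purely for readability, the asym-packing special case is omitted, and the sample values plus the imbalance_pct of 125 are illustrative only:

#include <stdbool.h>
#include <stdio.h>

struct stats {
        unsigned long this_load, max_load, avg_load;
        unsigned int busiest_nr_running, busiest_group_weight;
        unsigned int this_idle_cpus, busiest_idle_cpus;
        bool busiest, group_imb, this_has_capacity, busiest_has_capacity;
};

enum idle { NEWLY_IDLE, IDLE, NOT_IDLE };

static bool should_balance(const struct stats *s, enum idle idle,
                           unsigned int imbalance_pct)
{
        if (!s->busiest || s->busiest_nr_running == 0)
                return false;                   /* nothing to pull from */
        if (s->group_imb)
                return true;                    /* cpus_allowed skew: force it */
        if (idle == NEWLY_IDLE && s->this_has_capacity &&
            !s->busiest_has_capacity)
                return true;                    /* new-idle pull trumps SMP nice */
        if (s->this_load >= s->max_load)
                return false;                   /* we are the busy one */
        if (s->this_load >= s->avg_load)
                return false;                   /* already above domain average */
        if (idle == IDLE)
                return !(s->this_idle_cpus <= s->busiest_idle_cpus + 1 &&
                         s->busiest_nr_running <= s->busiest_group_weight);
        /* NEWLY_IDLE / NOT_IDLE: stay conservative via imbalance_pct */
        return 100 * s->max_load > imbalance_pct * s->this_load;
}

int main(void)
{
        struct stats s = { .this_load = 100, .max_load = 400, .avg_load = 250,
                           .busiest_nr_running = 3, .busiest_group_weight = 2,
                           .this_idle_cpus = 2, .busiest_idle_cpus = 0,
                           .busiest = true };

        printf("balance: %d\n", should_balance(&s, NOT_IDLE, 125));
        return 0;
}
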
@@ -3193,7 +3249,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, | |||
3193 | /* Working cpumask for load_balance and load_balance_newidle. */ | 3249 | /* Working cpumask for load_balance and load_balance_newidle. */ |
3194 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); | 3250 | static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); |
3195 | 3251 | ||
3196 | static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | 3252 | static int need_active_balance(struct sched_domain *sd, int idle, |
3197 | int busiest_cpu, int this_cpu) | 3253 | int busiest_cpu, int this_cpu) |
3198 | { | 3254 | { |
3199 | if (idle == CPU_NEWLY_IDLE) { | 3255 | if (idle == CPU_NEWLY_IDLE) { |
@@ -3225,10 +3281,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, | |||
3225 | * move_tasks() will succeed. ld_moved will be true and this | 3281 | * move_tasks() will succeed. ld_moved will be true and this |
3226 | * active balance code will not be triggered. | 3282 | * active balance code will not be triggered. |
3227 | */ | 3283 | */ |
3228 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3229 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3230 | return 0; | ||
3231 | |||
3232 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) | 3284 | if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) |
3233 | return 0; | 3285 | return 0; |
3234 | } | 3286 | } |
@@ -3246,7 +3298,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3246 | struct sched_domain *sd, enum cpu_idle_type idle, | 3298 | struct sched_domain *sd, enum cpu_idle_type idle, |
3247 | int *balance) | 3299 | int *balance) |
3248 | { | 3300 | { |
3249 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 3301 | int ld_moved, all_pinned = 0, active_balance = 0; |
3250 | struct sched_group *group; | 3302 | struct sched_group *group; |
3251 | unsigned long imbalance; | 3303 | unsigned long imbalance; |
3252 | struct rq *busiest; | 3304 | struct rq *busiest; |
@@ -3255,20 +3307,10 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3255 | 3307 | ||
3256 | cpumask_copy(cpus, cpu_active_mask); | 3308 | cpumask_copy(cpus, cpu_active_mask); |
3257 | 3309 | ||
3258 | /* | ||
3259 | * When power savings policy is enabled for the parent domain, idle | ||
3260 | * sibling can pick up load irrespective of busy siblings. In this case, | ||
3261 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | ||
3262 | * portraying it as CPU_NOT_IDLE. | ||
3263 | */ | ||
3264 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | ||
3265 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3266 | sd_idle = 1; | ||
3267 | |||
3268 | schedstat_inc(sd, lb_count[idle]); | 3310 | schedstat_inc(sd, lb_count[idle]); |
3269 | 3311 | ||
3270 | redo: | 3312 | redo: |
3271 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3313 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, |
3272 | cpus, balance); | 3314 | cpus, balance); |
3273 | 3315 | ||
3274 | if (*balance == 0) | 3316 | if (*balance == 0) |
@@ -3330,8 +3372,7 @@ redo: | |||
3330 | if (idle != CPU_NEWLY_IDLE) | 3372 | if (idle != CPU_NEWLY_IDLE) |
3331 | sd->nr_balance_failed++; | 3373 | sd->nr_balance_failed++; |
3332 | 3374 | ||
3333 | if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), | 3375 | if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { |
3334 | this_cpu)) { | ||
3335 | raw_spin_lock_irqsave(&busiest->lock, flags); | 3376 | raw_spin_lock_irqsave(&busiest->lock, flags); |
3336 | 3377 | ||
3337 | /* don't kick the active_load_balance_cpu_stop, | 3378 | /* don't kick the active_load_balance_cpu_stop, |
@@ -3386,10 +3427,6 @@ redo: | |||
3386 | sd->balance_interval *= 2; | 3427 | sd->balance_interval *= 2; |
3387 | } | 3428 | } |
3388 | 3429 | ||
3389 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | ||
3390 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3391 | ld_moved = -1; | ||
3392 | |||
3393 | goto out; | 3430 | goto out; |
3394 | 3431 | ||
3395 | out_balanced: | 3432 | out_balanced: |
@@ -3403,11 +3440,7 @@ out_one_pinned: | |||
3403 | (sd->balance_interval < sd->max_interval)) | 3440 | (sd->balance_interval < sd->max_interval)) |
3404 | sd->balance_interval *= 2; | 3441 | sd->balance_interval *= 2; |
3405 | 3442 | ||
3406 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 3443 | ld_moved = 0; |
3407 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | ||
3408 | ld_moved = -1; | ||
3409 | else | ||
3410 | ld_moved = 0; | ||
3411 | out: | 3444 | out: |
3412 | return ld_moved; | 3445 | return ld_moved; |
3413 | } | 3446 | } |
@@ -3831,8 +3864,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3831 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 3864 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
3832 | /* | 3865 | /* |
3833 | * We've pulled tasks over so either we're no | 3866 | * We've pulled tasks over so either we're no |
3834 | * longer idle, or one of our SMT siblings is | 3867 | * longer idle. |
3835 | * not idle. | ||
3836 | */ | 3868 | */ |
3837 | idle = CPU_NOT_IDLE; | 3869 | idle = CPU_NOT_IDLE; |
3838 | } | 3870 | } |
@@ -4079,33 +4111,62 @@ static void task_fork_fair(struct task_struct *p) | |||
4079 | * Priority of the task has changed. Check to see if we preempt | 4111 | * Priority of the task has changed. Check to see if we preempt |
4080 | * the current task. | 4112 | * the current task. |
4081 | */ | 4113 | */ |
4082 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | 4114 | static void |
4083 | int oldprio, int running) | 4115 | prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
4084 | { | 4116 | { |
4117 | if (!p->se.on_rq) | ||
4118 | return; | ||
4119 | |||
4085 | /* | 4120 | /* |
4086 | * Reschedule if we are currently running on this runqueue and | 4121 | * Reschedule if we are currently running on this runqueue and |
4087 | * our priority decreased, or if we are not currently running on | 4122 | * our priority decreased, or if we are not currently running on |
4088 | * this runqueue and our priority is higher than the current's | 4123 | * this runqueue and our priority is higher than the current's |
4089 | */ | 4124 | */ |
4090 | if (running) { | 4125 | if (rq->curr == p) { |
4091 | if (p->prio > oldprio) | 4126 | if (p->prio > oldprio) |
4092 | resched_task(rq->curr); | 4127 | resched_task(rq->curr); |
4093 | } else | 4128 | } else |
4094 | check_preempt_curr(rq, p, 0); | 4129 | check_preempt_curr(rq, p, 0); |
4095 | } | 4130 | } |
4096 | 4131 | ||
4132 | static void switched_from_fair(struct rq *rq, struct task_struct *p) | ||
4133 | { | ||
4134 | struct sched_entity *se = &p->se; | ||
4135 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
4136 | |||
4137 | /* | ||
4138 | * Ensure the task's vruntime is normalized, so that when its | ||
4139 | * switched back to the fair class the enqueue_entity(.flags=0) will | ||
4140 | * do the right thing. | ||
4141 | * | ||
4142 | * If it was on_rq, then the dequeue_entity(.flags=0) will already | ||
4143 | * have normalized the vruntime, if it was !on_rq, then only when | ||
4144 | * the task is sleeping will it still have non-normalized vruntime. | ||
4145 | */ | ||
4146 | if (!se->on_rq && p->state != TASK_RUNNING) { | ||
4147 | /* | ||
4148 | * Fix up our vruntime so that the current sleep doesn't | ||
4149 | * cause 'unlimited' sleep bonus. | ||
4150 | */ | ||
4151 | place_entity(cfs_rq, se, 0); | ||
4152 | se->vruntime -= cfs_rq->min_vruntime; | ||
4153 | } | ||
4154 | } | ||
4155 | |||
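
The normalization being enforced here can be pictured with plain numbers. The toy model below (not kernel code, arbitrary values) shows why a sleeping entity's vruntime is stored relative to min_vruntime and re-based at enqueue time, so time that passed while it was off the runqueue does not turn into an outsized bonus:

#include <stdio.h>

int main(void)
{
        unsigned long long min_vruntime = 1000000;      /* cfs_rq clock when the task leaves */
        unsigned long long se_vruntime  = 1004000;      /* this entity's virtual runtime */

        /* switched_from_fair() / dequeue: store it relative to the queue */
        se_vruntime -= min_vruntime;

        /* the runqueue keeps advancing while the task is away */
        min_vruntime += 750000;

        /* enqueue_entity(.flags=0): re-base against the new clock */
        se_vruntime += min_vruntime;

        printf("re-enqueued vruntime %llu vs min_vruntime %llu\n",
               se_vruntime, min_vruntime);
        return 0;
}
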
4097 | /* | 4156 | /* |
4098 | * We switched to the sched_fair class. | 4157 | * We switched to the sched_fair class. |
4099 | */ | 4158 | */ |
4100 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | 4159 | static void switched_to_fair(struct rq *rq, struct task_struct *p) |
4101 | int running) | ||
4102 | { | 4160 | { |
4161 | if (!p->se.on_rq) | ||
4162 | return; | ||
4163 | |||
4103 | /* | 4164 | /* |
4104 | * We were most likely switched from sched_rt, so | 4165 | * We were most likely switched from sched_rt, so |
4105 | * kick off the schedule if running, otherwise just see | 4166 | * kick off the schedule if running, otherwise just see |
4106 | * if we can still preempt the current task. | 4167 | * if we can still preempt the current task. |
4107 | */ | 4168 | */ |
4108 | if (running) | 4169 | if (rq->curr == p) |
4109 | resched_task(rq->curr); | 4170 | resched_task(rq->curr); |
4110 | else | 4171 | else |
4111 | check_preempt_curr(rq, p, 0); | 4172 | check_preempt_curr(rq, p, 0); |
@@ -4171,6 +4232,7 @@ static const struct sched_class fair_sched_class = { | |||
4171 | .enqueue_task = enqueue_task_fair, | 4232 | .enqueue_task = enqueue_task_fair, |
4172 | .dequeue_task = dequeue_task_fair, | 4233 | .dequeue_task = dequeue_task_fair, |
4173 | .yield_task = yield_task_fair, | 4234 | .yield_task = yield_task_fair, |
4235 | .yield_to_task = yield_to_task_fair, | ||
4174 | 4236 | ||
4175 | .check_preempt_curr = check_preempt_wakeup, | 4237 | .check_preempt_curr = check_preempt_wakeup, |
4176 | 4238 | ||
@@ -4191,6 +4253,7 @@ static const struct sched_class fair_sched_class = { | |||
4191 | .task_fork = task_fork_fair, | 4253 | .task_fork = task_fork_fair, |
4192 | 4254 | ||
4193 | .prio_changed = prio_changed_fair, | 4255 | .prio_changed = prio_changed_fair, |
4256 | .switched_from = switched_from_fair, | ||
4194 | .switched_to = switched_to_fair, | 4257 | .switched_to = switched_to_fair, |
4195 | 4258 | ||
4196 | .get_rr_interval = get_rr_interval_fair, | 4259 | .get_rr_interval = get_rr_interval_fair, |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 9fa0f402c87c..c82f26c1b7c3 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq) | |||
52 | { | 52 | { |
53 | } | 53 | } |
54 | 54 | ||
55 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | 55 | static void switched_to_idle(struct rq *rq, struct task_struct *p) |
56 | int running) | ||
57 | { | 56 | { |
58 | /* Can this actually happen?? */ | 57 | BUG(); |
59 | if (running) | ||
60 | resched_task(rq->curr); | ||
61 | else | ||
62 | check_preempt_curr(rq, p, 0); | ||
63 | } | 58 | } |
64 | 59 | ||
65 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | 60 | static void |
66 | int oldprio, int running) | 61 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) |
67 | { | 62 | { |
68 | /* This can happen for hot plug CPUS */ | 63 | BUG(); |
69 | |||
70 | /* | ||
71 | * Reschedule if we are currently running on this runqueue and | ||
72 | * our priority decreased, or if we are not currently running on | ||
73 | * this runqueue and our priority is higher than the current's | ||
74 | */ | ||
75 | if (running) { | ||
76 | if (p->prio > oldprio) | ||
77 | resched_task(rq->curr); | ||
78 | } else | ||
79 | check_preempt_curr(rq, p, 0); | ||
80 | } | 64 | } |
81 | 65 | ||
82 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | 66 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 01f75a5f17af..db308cb08b75 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -1599,8 +1599,7 @@ static void rq_offline_rt(struct rq *rq) | |||
1599 | * When switch from the rt queue, we bring ourselves to a position | 1599 | * When switch from the rt queue, we bring ourselves to a position |
1600 | * that we might want to pull RT tasks from other runqueues. | 1600 | * that we might want to pull RT tasks from other runqueues. |
1601 | */ | 1601 | */ |
1602 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | 1602 | static void switched_from_rt(struct rq *rq, struct task_struct *p) |
1603 | int running) | ||
1604 | { | 1603 | { |
1605 | /* | 1604 | /* |
1606 | * If there are other RT tasks then we will reschedule | 1605 | * If there are other RT tasks then we will reschedule |
@@ -1609,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p, | |||
1609 | * we may need to handle the pulling of RT tasks | 1608 | * we may need to handle the pulling of RT tasks |
1610 | * now. | 1609 | * now. |
1611 | */ | 1610 | */ |
1612 | if (!rq->rt.rt_nr_running) | 1611 | if (p->se.on_rq && !rq->rt.rt_nr_running) |
1613 | pull_rt_task(rq); | 1612 | pull_rt_task(rq); |
1614 | } | 1613 | } |
1615 | 1614 | ||
@@ -1628,8 +1627,7 @@ static inline void init_sched_rt_class(void) | |||
1628 | * with RT tasks. In this case we try to push them off to | 1627 | * with RT tasks. In this case we try to push them off to |
1629 | * other runqueues. | 1628 | * other runqueues. |
1630 | */ | 1629 | */ |
1631 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | 1630 | static void switched_to_rt(struct rq *rq, struct task_struct *p) |
1632 | int running) | ||
1633 | { | 1631 | { |
1634 | int check_resched = 1; | 1632 | int check_resched = 1; |
1635 | 1633 | ||
@@ -1640,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
1640 | * If that current running task is also an RT task | 1638 | * If that current running task is also an RT task |
1641 | * then see if we can move to another run queue. | 1639 | * then see if we can move to another run queue. |
1642 | */ | 1640 | */ |
1643 | if (!running) { | 1641 | if (p->se.on_rq && rq->curr != p) { |
1644 | #ifdef CONFIG_SMP | 1642 | #ifdef CONFIG_SMP |
1645 | if (rq->rt.overloaded && push_rt_task(rq) && | 1643 | if (rq->rt.overloaded && push_rt_task(rq) && |
1646 | /* Don't resched if we changed runqueues */ | 1644 | /* Don't resched if we changed runqueues */ |
@@ -1656,10 +1654,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p, | |||
1656 | * Priority of the task has changed. This may cause | 1654 | * Priority of the task has changed. This may cause |
1657 | * us to initiate a push or pull. | 1655 | * us to initiate a push or pull. |
1658 | */ | 1656 | */ |
1659 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | 1657 | static void |
1660 | int oldprio, int running) | 1658 | prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) |
1661 | { | 1659 | { |
1662 | if (running) { | 1660 | if (!p->se.on_rq) |
1661 | return; | ||
1662 | |||
1663 | if (rq->curr == p) { | ||
1663 | #ifdef CONFIG_SMP | 1664 | #ifdef CONFIG_SMP |
1664 | /* | 1665 | /* |
1665 | * If our priority decreases while running, we | 1666 | * If our priority decreases while running, we |
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 2bf6b47058c1..84ec9bcf82d9 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c | |||
@@ -59,14 +59,13 @@ static void set_curr_task_stop(struct rq *rq) | |||
59 | { | 59 | { |
60 | } | 60 | } |
61 | 61 | ||
62 | static void switched_to_stop(struct rq *rq, struct task_struct *p, | 62 | static void switched_to_stop(struct rq *rq, struct task_struct *p) |
63 | int running) | ||
64 | { | 63 | { |
65 | BUG(); /* it's impossible to change to this class */ | 64 | BUG(); /* it's impossible to change to this class */ |
66 | } | 65 | } |
67 | 66 | ||
68 | static void prio_changed_stop(struct rq *rq, struct task_struct *p, | 67 | static void |
69 | int oldprio, int running) | 68 | prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) |
70 | { | 69 | { |
71 | BUG(); /* how!?, what priority? */ | 70 | BUG(); /* how!?, what priority? */ |
72 | } | 71 | } |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec388..0cee50487629 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_stat); | |||
54 | 54 | ||
55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; | 55 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; |
56 | 56 | ||
57 | static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | 57 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); |
58 | 58 | ||
59 | char *softirq_to_name[NR_SOFTIRQS] = { | 59 | char *softirq_to_name[NR_SOFTIRQS] = { |
60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", | 60 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", |
@@ -721,7 +721,6 @@ static int run_ksoftirqd(void * __bind_cpu) | |||
721 | { | 721 | { |
722 | set_current_state(TASK_INTERRUPTIBLE); | 722 | set_current_state(TASK_INTERRUPTIBLE); |
723 | 723 | ||
724 | current->flags |= PF_KSOFTIRQD; | ||
725 | while (!kthread_should_stop()) { | 724 | while (!kthread_should_stop()) { |
726 | preempt_disable(); | 725 | preempt_disable(); |
727 | if (!local_softirq_pending()) { | 726 | if (!local_softirq_pending()) { |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 19b9d85e06cc..51054fea5d99 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -361,20 +361,13 @@ static struct ctl_table kern_table[] = { | |||
361 | .mode = 0644, | 361 | .mode = 0644, |
362 | .proc_handler = sched_rt_handler, | 362 | .proc_handler = sched_rt_handler, |
363 | }, | 363 | }, |
364 | { | ||
365 | .procname = "sched_compat_yield", | ||
366 | .data = &sysctl_sched_compat_yield, | ||
367 | .maxlen = sizeof(unsigned int), | ||
368 | .mode = 0644, | ||
369 | .proc_handler = proc_dointvec, | ||
370 | }, | ||
371 | #ifdef CONFIG_SCHED_AUTOGROUP | 364 | #ifdef CONFIG_SCHED_AUTOGROUP |
372 | { | 365 | { |
373 | .procname = "sched_autogroup_enabled", | 366 | .procname = "sched_autogroup_enabled", |
374 | .data = &sysctl_sched_autogroup_enabled, | 367 | .data = &sysctl_sched_autogroup_enabled, |
375 | .maxlen = sizeof(unsigned int), | 368 | .maxlen = sizeof(unsigned int), |
376 | .mode = 0644, | 369 | .mode = 0644, |
377 | .proc_handler = proc_dointvec, | 370 | .proc_handler = proc_dointvec_minmax, |
378 | .extra1 = &zero, | 371 | .extra1 = &zero, |
379 | .extra2 = &one, | 372 | .extra2 = &one, |
380 | }, | 373 | }, |
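
With proc_dointvec_minmax() and extra1/extra2 bounding the value to 0..1, out-of-range writes to the autogroup knob are now rejected instead of being stored. A userspace sketch of the observable difference, assuming a kernel built with CONFIG_SCHED_AUTOGROUP and root privileges; the second write is expected to fail with EINVAL:

#include <errno.h>
#include <stdio.h>
#include <string.h>

static int write_knob(const char *val)
{
        FILE *f = fopen("/proc/sys/kernel/sched_autogroup_enabled", "w");
        int ret = 0;

        if (!f)
                return -1;
        if (fprintf(f, "%s\n", val) < 0)
                ret = -1;
        if (fclose(f) == EOF)           /* proc rejects bad input at flush time */
                ret = -1;
        return ret;
}

int main(void)
{
        printf("write \"1\": %s\n", write_knob("1") ? strerror(errno) : "ok");
        printf("write \"2\": %s\n", write_knob("2") ? strerror(errno) : "ok");
        return 0;
}
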
diff --git a/kernel/time.c b/kernel/time.c index 32174359576f..55337a816b20 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x) | |||
645 | } | 645 | } |
646 | 646 | ||
647 | /** | 647 | /** |
648 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies | 648 | * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 |
649 | * | 649 | * |
650 | * @n: nsecs in u64 | 650 | * @n: nsecs in u64 |
651 | * | 651 | * |
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x) | |||
657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | 657 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) |
658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | 658 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years |
659 | */ | 659 | */ |
660 | unsigned long nsecs_to_jiffies(u64 n) | 660 | u64 nsecs_to_jiffies64(u64 n) |
661 | { | 661 | { |
662 | #if (NSEC_PER_SEC % HZ) == 0 | 662 | #if (NSEC_PER_SEC % HZ) == 0 |
663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ | 663 | /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ |
@@ -674,6 +674,25 @@ unsigned long nsecs_to_jiffies(u64 n) | |||
674 | #endif | 674 | #endif |
675 | } | 675 | } |
676 | 676 | ||
677 | |||
678 | /** | ||
679 | * nsecs_to_jiffies - Convert nsecs in u64 to jiffies | ||
680 | * | ||
681 | * @n: nsecs in u64 | ||
682 | * | ||
683 | * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. | ||
684 | * And this doesn't return MAX_JIFFY_OFFSET since this function is designed | ||
685 | * for scheduler, not for use in device drivers to calculate timeout value. | ||
686 | * | ||
687 | * note: | ||
688 | * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) | ||
689 | * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years | ||
690 | */ | ||
691 | unsigned long nsecs_to_jiffies(u64 n) | ||
692 | { | ||
693 | return (unsigned long)nsecs_to_jiffies64(n); | ||
694 | } | ||
695 | |||
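
The new nsecs_to_jiffies64() keeps the full 64-bit quotient, while the reinstated nsecs_to_jiffies() merely truncates it to unsigned long. A userspace sketch of the common-case arithmetic (NSEC_PER_SEC % HZ == 0); HZ is assumed to be 1000 here purely for illustration, since the kernel's actual value is a build-time constant:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ           1000ULL             /* assumption for the example */

static uint64_t nsecs_to_jiffies64(uint64_t n)
{
        return n / (NSEC_PER_SEC / HZ);  /* divide by nanoseconds per tick */
}

int main(void)
{
        uint64_t n = 2500000000ULL;      /* 2.5 s in nanoseconds */

        printf("%llu ns -> %llu jiffies (as unsigned long: %lu)\n",
               (unsigned long long)n,
               (unsigned long long)nsecs_to_jiffies64(n),
               (unsigned long)nsecs_to_jiffies64(n));
        return 0;
}
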
677 | #if (BITS_PER_LONG < 64) | 696 | #if (BITS_PER_LONG < 64) |
678 | u64 get_jiffies_64(void) | 697 | u64 get_jiffies_64(void) |
679 | { | 698 | { |