diff options
Diffstat (limited to 'kernel/sched.c')
| -rw-r--r-- | kernel/sched.c | 556 |
1 files changed, 339 insertions, 217 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 1535f3884b88..ff39cadf621e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); | |||
| 309 | */ | 309 | */ |
| 310 | static DEFINE_SPINLOCK(task_group_lock); | 310 | static DEFINE_SPINLOCK(task_group_lock); |
| 311 | 311 | ||
| 312 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 313 | |||
| 312 | #ifdef CONFIG_SMP | 314 | #ifdef CONFIG_SMP |
| 313 | static int root_task_group_empty(void) | 315 | static int root_task_group_empty(void) |
| 314 | { | 316 | { |
| @@ -316,7 +318,6 @@ static int root_task_group_empty(void) | |||
| 316 | } | 318 | } |
| 317 | #endif | 319 | #endif |
| 318 | 320 | ||
| 319 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
| 320 | #ifdef CONFIG_USER_SCHED | 321 | #ifdef CONFIG_USER_SCHED |
| 321 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 322 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
| 322 | #else /* !CONFIG_USER_SCHED */ | 323 | #else /* !CONFIG_USER_SCHED */ |
| @@ -534,14 +535,12 @@ struct rq { | |||
| 534 | #define CPU_LOAD_IDX_MAX 5 | 535 | #define CPU_LOAD_IDX_MAX 5 |
| 535 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 536 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
| 536 | #ifdef CONFIG_NO_HZ | 537 | #ifdef CONFIG_NO_HZ |
| 537 | unsigned long last_tick_seen; | ||
| 538 | unsigned char in_nohz_recently; | 538 | unsigned char in_nohz_recently; |
| 539 | #endif | 539 | #endif |
| 540 | /* capture load from *all* tasks on this cpu: */ | 540 | /* capture load from *all* tasks on this cpu: */ |
| 541 | struct load_weight load; | 541 | struct load_weight load; |
| 542 | unsigned long nr_load_updates; | 542 | unsigned long nr_load_updates; |
| 543 | u64 nr_switches; | 543 | u64 nr_switches; |
| 544 | u64 nr_migrations_in; | ||
| 545 | 544 | ||
| 546 | struct cfs_rq cfs; | 545 | struct cfs_rq cfs; |
| 547 | struct rt_rq rt; | 546 | struct rt_rq rt; |
| @@ -590,6 +589,8 @@ struct rq { | |||
| 590 | 589 | ||
| 591 | u64 rt_avg; | 590 | u64 rt_avg; |
| 592 | u64 age_stamp; | 591 | u64 age_stamp; |
| 592 | u64 idle_stamp; | ||
| 593 | u64 avg_idle; | ||
| 593 | #endif | 594 | #endif |
| 594 | 595 | ||
| 595 | /* calc_load related fields */ | 596 | /* calc_load related fields */ |
| @@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq) | |||
| 676 | 677 | ||
| 677 | /** | 678 | /** |
| 678 | * runqueue_is_locked | 679 | * runqueue_is_locked |
| 680 | * @cpu: the processor in question. | ||
| 679 | * | 681 | * |
| 680 | * Returns true if the current cpu runqueue is locked. | 682 | * Returns true if the current cpu runqueue is locked. |
| 681 | * This interface allows printk to be called with the runqueue lock | 683 | * This interface allows printk to be called with the runqueue lock |
| @@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
| 770 | if (!sched_feat_names[i]) | 772 | if (!sched_feat_names[i]) |
| 771 | return -EINVAL; | 773 | return -EINVAL; |
| 772 | 774 | ||
| 773 | filp->f_pos += cnt; | 775 | *ppos += cnt; |
| 774 | 776 | ||
| 775 | return cnt; | 777 | return cnt; |
| 776 | } | 778 | } |
| @@ -812,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; | |||
| 812 | * default: 0.25ms | 814 | * default: 0.25ms |
| 813 | */ | 815 | */ |
| 814 | unsigned int sysctl_sched_shares_ratelimit = 250000; | 816 | unsigned int sysctl_sched_shares_ratelimit = 250000; |
| 817 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
| 815 | 818 | ||
| 816 | /* | 819 | /* |
| 817 | * Inject some fuzzyness into changing the per-cpu group shares | 820 | * Inject some fuzzyness into changing the per-cpu group shares |
| @@ -1563,11 +1566,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 1563 | 1566 | ||
| 1564 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1567 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 1565 | 1568 | ||
| 1566 | struct update_shares_data { | 1569 | static __read_mostly unsigned long *update_shares_data; |
| 1567 | unsigned long rq_weight[NR_CPUS]; | ||
| 1568 | }; | ||
| 1569 | |||
| 1570 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
| 1571 | 1570 | ||
| 1572 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1571 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
| 1573 | 1572 | ||
| @@ -1577,12 +1576,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); | |||
| 1577 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | 1576 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
| 1578 | unsigned long sd_shares, | 1577 | unsigned long sd_shares, |
| 1579 | unsigned long sd_rq_weight, | 1578 | unsigned long sd_rq_weight, |
| 1580 | struct update_shares_data *usd) | 1579 | unsigned long *usd_rq_weight) |
| 1581 | { | 1580 | { |
| 1582 | unsigned long shares, rq_weight; | 1581 | unsigned long shares, rq_weight; |
| 1583 | int boost = 0; | 1582 | int boost = 0; |
| 1584 | 1583 | ||
| 1585 | rq_weight = usd->rq_weight[cpu]; | 1584 | rq_weight = usd_rq_weight[cpu]; |
| 1586 | if (!rq_weight) { | 1585 | if (!rq_weight) { |
| 1587 | boost = 1; | 1586 | boost = 1; |
| 1588 | rq_weight = NICE_0_LOAD; | 1587 | rq_weight = NICE_0_LOAD; |
| @@ -1616,8 +1615,8 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, | |||
| 1616 | */ | 1615 | */ |
| 1617 | static int tg_shares_up(struct task_group *tg, void *data) | 1616 | static int tg_shares_up(struct task_group *tg, void *data) |
| 1618 | { | 1617 | { |
| 1619 | unsigned long weight, rq_weight = 0, shares = 0; | 1618 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; |
| 1620 | struct update_shares_data *usd; | 1619 | unsigned long *usd_rq_weight; |
| 1621 | struct sched_domain *sd = data; | 1620 | struct sched_domain *sd = data; |
| 1622 | unsigned long flags; | 1621 | unsigned long flags; |
| 1623 | int i; | 1622 | int i; |
| @@ -1626,12 +1625,13 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
| 1626 | return 0; | 1625 | return 0; |
| 1627 | 1626 | ||
| 1628 | local_irq_save(flags); | 1627 | local_irq_save(flags); |
| 1629 | usd = &__get_cpu_var(update_shares_data); | 1628 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); |
| 1630 | 1629 | ||
| 1631 | for_each_cpu(i, sched_domain_span(sd)) { | 1630 | for_each_cpu(i, sched_domain_span(sd)) { |
| 1632 | weight = tg->cfs_rq[i]->load.weight; | 1631 | weight = tg->cfs_rq[i]->load.weight; |
| 1633 | usd->rq_weight[i] = weight; | 1632 | usd_rq_weight[i] = weight; |
| 1634 | 1633 | ||
| 1634 | rq_weight += weight; | ||
| 1635 | /* | 1635 | /* |
| 1636 | * If there are currently no tasks on the cpu pretend there | 1636 | * If there are currently no tasks on the cpu pretend there |
| 1637 | * is one of average load so that when a new task gets to | 1637 | * is one of average load so that when a new task gets to |
| @@ -1640,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
| 1640 | if (!weight) | 1640 | if (!weight) |
| 1641 | weight = NICE_0_LOAD; | 1641 | weight = NICE_0_LOAD; |
| 1642 | 1642 | ||
| 1643 | rq_weight += weight; | 1643 | sum_weight += weight; |
| 1644 | shares += tg->cfs_rq[i]->shares; | 1644 | shares += tg->cfs_rq[i]->shares; |
| 1645 | } | 1645 | } |
| 1646 | 1646 | ||
| 1647 | if (!rq_weight) | ||
| 1648 | rq_weight = sum_weight; | ||
| 1649 | |||
| 1647 | if ((!shares && rq_weight) || shares > tg->shares) | 1650 | if ((!shares && rq_weight) || shares > tg->shares) |
| 1648 | shares = tg->shares; | 1651 | shares = tg->shares; |
| 1649 | 1652 | ||
| @@ -1651,7 +1654,7 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
| 1651 | shares = tg->shares; | 1654 | shares = tg->shares; |
| 1652 | 1655 | ||
| 1653 | for_each_cpu(i, sched_domain_span(sd)) | 1656 | for_each_cpu(i, sched_domain_span(sd)) |
| 1654 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); | 1657 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); |
| 1655 | 1658 | ||
| 1656 | local_irq_restore(flags); | 1659 | local_irq_restore(flags); |
| 1657 | 1660 | ||
| @@ -1812,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | |||
| 1812 | #endif | 1815 | #endif |
| 1813 | 1816 | ||
| 1814 | static void calc_load_account_active(struct rq *this_rq); | 1817 | static void calc_load_account_active(struct rq *this_rq); |
| 1818 | static void update_sysctl(void); | ||
| 1819 | static int get_update_sysctl_factor(void); | ||
| 1820 | |||
| 1821 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
| 1822 | { | ||
| 1823 | set_task_rq(p, cpu); | ||
| 1824 | #ifdef CONFIG_SMP | ||
| 1825 | /* | ||
| 1826 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
| 1827 | * successfuly executed on another CPU. We must ensure that updates of | ||
| 1828 | * per-task data have been completed by this moment. | ||
| 1829 | */ | ||
| 1830 | smp_wmb(); | ||
| 1831 | task_thread_info(p)->cpu = cpu; | ||
| 1832 | #endif | ||
| 1833 | } | ||
| 1815 | 1834 | ||
| 1816 | #include "sched_stats.h" | 1835 | #include "sched_stats.h" |
| 1817 | #include "sched_idletask.c" | 1836 | #include "sched_idletask.c" |
| @@ -1969,20 +1988,6 @@ inline int task_curr(const struct task_struct *p) | |||
| 1969 | return cpu_curr(task_cpu(p)) == p; | 1988 | return cpu_curr(task_cpu(p)) == p; |
| 1970 | } | 1989 | } |
| 1971 | 1990 | ||
| 1972 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | ||
| 1973 | { | ||
| 1974 | set_task_rq(p, cpu); | ||
| 1975 | #ifdef CONFIG_SMP | ||
| 1976 | /* | ||
| 1977 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
| 1978 | * successfuly executed on another CPU. We must ensure that updates of | ||
| 1979 | * per-task data have been completed by this moment. | ||
| 1980 | */ | ||
| 1981 | smp_wmb(); | ||
| 1982 | task_thread_info(p)->cpu = cpu; | ||
| 1983 | #endif | ||
| 1984 | } | ||
| 1985 | |||
| 1986 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | 1991 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, |
| 1987 | const struct sched_class *prev_class, | 1992 | const struct sched_class *prev_class, |
| 1988 | int oldprio, int running) | 1993 | int oldprio, int running) |
| @@ -1995,6 +2000,39 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
| 1995 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2000 | p->sched_class->prio_changed(rq, p, oldprio, running); |
| 1996 | } | 2001 | } |
| 1997 | 2002 | ||
| 2003 | /** | ||
| 2004 | * kthread_bind - bind a just-created kthread to a cpu. | ||
| 2005 | * @p: thread created by kthread_create(). | ||
| 2006 | * @cpu: cpu (might not be online, must be possible) for @k to run on. | ||
| 2007 | * | ||
| 2008 | * Description: This function is equivalent to set_cpus_allowed(), | ||
| 2009 | * except that @cpu doesn't need to be online, and the thread must be | ||
| 2010 | * stopped (i.e., just returned from kthread_create()). | ||
| 2011 | * | ||
| 2012 | * Function lives here instead of kthread.c because it messes with | ||
| 2013 | * scheduler internals which require locking. | ||
| 2014 | */ | ||
| 2015 | void kthread_bind(struct task_struct *p, unsigned int cpu) | ||
| 2016 | { | ||
| 2017 | struct rq *rq = cpu_rq(cpu); | ||
| 2018 | unsigned long flags; | ||
| 2019 | |||
| 2020 | /* Must have done schedule() in kthread() before we set_task_cpu */ | ||
| 2021 | if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { | ||
| 2022 | WARN_ON(1); | ||
| 2023 | return; | ||
| 2024 | } | ||
| 2025 | |||
| 2026 | spin_lock_irqsave(&rq->lock, flags); | ||
| 2027 | update_rq_clock(rq); | ||
| 2028 | set_task_cpu(p, cpu); | ||
| 2029 | p->cpus_allowed = cpumask_of_cpu(cpu); | ||
| 2030 | p->rt.nr_cpus_allowed = 1; | ||
| 2031 | p->flags |= PF_THREAD_BOUND; | ||
| 2032 | spin_unlock_irqrestore(&rq->lock, flags); | ||
| 2033 | } | ||
| 2034 | EXPORT_SYMBOL(kthread_bind); | ||
| 2035 | |||
| 1998 | #ifdef CONFIG_SMP | 2036 | #ifdef CONFIG_SMP |
| 1999 | /* | 2037 | /* |
| 2000 | * Is this task likely cache-hot: | 2038 | * Is this task likely cache-hot: |
| @@ -2007,7 +2045,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 2007 | /* | 2045 | /* |
| 2008 | * Buddy candidates are cache hot: | 2046 | * Buddy candidates are cache hot: |
| 2009 | */ | 2047 | */ |
| 2010 | if (sched_feat(CACHE_HOT_BUDDY) && | 2048 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && |
| 2011 | (&p->se == cfs_rq_of(&p->se)->next || | 2049 | (&p->se == cfs_rq_of(&p->se)->next || |
| 2012 | &p->se == cfs_rq_of(&p->se)->last)) | 2050 | &p->se == cfs_rq_of(&p->se)->last)) |
| 2013 | return 1; | 2051 | return 1; |
| @@ -2029,30 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
| 2029 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 2067 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
| 2030 | { | 2068 | { |
| 2031 | int old_cpu = task_cpu(p); | 2069 | int old_cpu = task_cpu(p); |
| 2032 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | ||
| 2033 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), | 2070 | struct cfs_rq *old_cfsrq = task_cfs_rq(p), |
| 2034 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); | 2071 | *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); |
| 2035 | u64 clock_offset; | ||
| 2036 | |||
| 2037 | clock_offset = old_rq->clock - new_rq->clock; | ||
| 2038 | 2072 | ||
| 2039 | trace_sched_migrate_task(p, new_cpu); | 2073 | trace_sched_migrate_task(p, new_cpu); |
| 2040 | 2074 | ||
| 2041 | #ifdef CONFIG_SCHEDSTATS | ||
| 2042 | if (p->se.wait_start) | ||
| 2043 | p->se.wait_start -= clock_offset; | ||
| 2044 | if (p->se.sleep_start) | ||
| 2045 | p->se.sleep_start -= clock_offset; | ||
| 2046 | if (p->se.block_start) | ||
| 2047 | p->se.block_start -= clock_offset; | ||
| 2048 | #endif | ||
| 2049 | if (old_cpu != new_cpu) { | 2075 | if (old_cpu != new_cpu) { |
| 2050 | p->se.nr_migrations++; | 2076 | p->se.nr_migrations++; |
| 2051 | new_rq->nr_migrations_in++; | ||
| 2052 | #ifdef CONFIG_SCHEDSTATS | ||
| 2053 | if (task_hot(p, old_rq->clock, NULL)) | ||
| 2054 | schedstat_inc(p, se.nr_forced2_migrations); | ||
| 2055 | #endif | ||
| 2056 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, | 2077 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, |
| 2057 | 1, 1, NULL, 0); | 2078 | 1, 1, NULL, 0); |
| 2058 | } | 2079 | } |
| @@ -2085,6 +2106,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | |||
| 2085 | * it is sufficient to simply update the task's cpu field. | 2106 | * it is sufficient to simply update the task's cpu field. |
| 2086 | */ | 2107 | */ |
| 2087 | if (!p->se.on_rq && !task_running(rq, p)) { | 2108 | if (!p->se.on_rq && !task_running(rq, p)) { |
| 2109 | update_rq_clock(rq); | ||
| 2088 | set_task_cpu(p, dest_cpu); | 2110 | set_task_cpu(p, dest_cpu); |
| 2089 | return 0; | 2111 | return 0; |
| 2090 | } | 2112 | } |
| @@ -2292,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p, | |||
| 2292 | preempt_enable(); | 2314 | preempt_enable(); |
| 2293 | } | 2315 | } |
| 2294 | 2316 | ||
| 2317 | #ifdef CONFIG_SMP | ||
| 2318 | static inline | ||
| 2319 | int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) | ||
| 2320 | { | ||
| 2321 | return p->sched_class->select_task_rq(p, sd_flags, wake_flags); | ||
| 2322 | } | ||
| 2323 | #endif | ||
| 2324 | |||
| 2295 | /*** | 2325 | /*** |
| 2296 | * try_to_wake_up - wake up a thread | 2326 | * try_to_wake_up - wake up a thread |
| 2297 | * @p: the to-be-woken-up thread | 2327 | * @p: the to-be-woken-up thread |
| @@ -2311,7 +2341,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
| 2311 | { | 2341 | { |
| 2312 | int cpu, orig_cpu, this_cpu, success = 0; | 2342 | int cpu, orig_cpu, this_cpu, success = 0; |
| 2313 | unsigned long flags; | 2343 | unsigned long flags; |
| 2314 | struct rq *rq; | 2344 | struct rq *rq, *orig_rq; |
| 2315 | 2345 | ||
| 2316 | if (!sched_feat(SYNC_WAKEUPS)) | 2346 | if (!sched_feat(SYNC_WAKEUPS)) |
| 2317 | wake_flags &= ~WF_SYNC; | 2347 | wake_flags &= ~WF_SYNC; |
| @@ -2319,7 +2349,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
| 2319 | this_cpu = get_cpu(); | 2349 | this_cpu = get_cpu(); |
| 2320 | 2350 | ||
| 2321 | smp_wmb(); | 2351 | smp_wmb(); |
| 2322 | rq = task_rq_lock(p, &flags); | 2352 | rq = orig_rq = task_rq_lock(p, &flags); |
| 2323 | update_rq_clock(rq); | 2353 | update_rq_clock(rq); |
| 2324 | if (!(p->state & state)) | 2354 | if (!(p->state & state)) |
| 2325 | goto out; | 2355 | goto out; |
| @@ -2343,13 +2373,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, | |||
| 2343 | if (task_contributes_to_load(p)) | 2373 | if (task_contributes_to_load(p)) |
| 2344 | rq->nr_uninterruptible--; | 2374 | rq->nr_uninterruptible--; |
| 2345 | p->state = TASK_WAKING; | 2375 | p->state = TASK_WAKING; |
| 2346 | task_rq_unlock(rq, &flags); | 2376 | __task_rq_unlock(rq); |
| 2347 | 2377 | ||
| 2348 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | 2378 | cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); |
| 2349 | if (cpu != orig_cpu) | 2379 | if (cpu != orig_cpu) |
| 2350 | set_task_cpu(p, cpu); | 2380 | set_task_cpu(p, cpu); |
| 2351 | 2381 | ||
| 2352 | rq = task_rq_lock(p, &flags); | 2382 | rq = __task_rq_lock(p); |
| 2383 | update_rq_clock(rq); | ||
| 2384 | |||
| 2353 | WARN_ON(p->state != TASK_WAKING); | 2385 | WARN_ON(p->state != TASK_WAKING); |
| 2354 | cpu = task_cpu(p); | 2386 | cpu = task_cpu(p); |
| 2355 | 2387 | ||
| @@ -2406,6 +2438,17 @@ out_running: | |||
| 2406 | #ifdef CONFIG_SMP | 2438 | #ifdef CONFIG_SMP |
| 2407 | if (p->sched_class->task_wake_up) | 2439 | if (p->sched_class->task_wake_up) |
| 2408 | p->sched_class->task_wake_up(rq, p); | 2440 | p->sched_class->task_wake_up(rq, p); |
| 2441 | |||
| 2442 | if (unlikely(rq->idle_stamp)) { | ||
| 2443 | u64 delta = rq->clock - rq->idle_stamp; | ||
| 2444 | u64 max = 2*sysctl_sched_migration_cost; | ||
| 2445 | |||
| 2446 | if (delta > max) | ||
| 2447 | rq->avg_idle = max; | ||
| 2448 | else | ||
| 2449 | update_avg(&rq->avg_idle, delta); | ||
| 2450 | rq->idle_stamp = 0; | ||
| 2451 | } | ||
| 2409 | #endif | 2452 | #endif |
| 2410 | out: | 2453 | out: |
| 2411 | task_rq_unlock(rq, &flags); | 2454 | task_rq_unlock(rq, &flags); |
| @@ -2452,7 +2495,6 @@ static void __sched_fork(struct task_struct *p) | |||
| 2452 | p->se.avg_overlap = 0; | 2495 | p->se.avg_overlap = 0; |
| 2453 | p->se.start_runtime = 0; | 2496 | p->se.start_runtime = 0; |
| 2454 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2497 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
| 2455 | p->se.avg_running = 0; | ||
| 2456 | 2498 | ||
| 2457 | #ifdef CONFIG_SCHEDSTATS | 2499 | #ifdef CONFIG_SCHEDSTATS |
| 2458 | p->se.wait_start = 0; | 2500 | p->se.wait_start = 0; |
| @@ -2474,7 +2516,6 @@ static void __sched_fork(struct task_struct *p) | |||
| 2474 | p->se.nr_failed_migrations_running = 0; | 2516 | p->se.nr_failed_migrations_running = 0; |
| 2475 | p->se.nr_failed_migrations_hot = 0; | 2517 | p->se.nr_failed_migrations_hot = 0; |
| 2476 | p->se.nr_forced_migrations = 0; | 2518 | p->se.nr_forced_migrations = 0; |
| 2477 | p->se.nr_forced2_migrations = 0; | ||
| 2478 | 2519 | ||
| 2479 | p->se.nr_wakeups = 0; | 2520 | p->se.nr_wakeups = 0; |
| 2480 | p->se.nr_wakeups_sync = 0; | 2521 | p->se.nr_wakeups_sync = 0; |
| @@ -2515,22 +2556,17 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 2515 | __sched_fork(p); | 2556 | __sched_fork(p); |
| 2516 | 2557 | ||
| 2517 | /* | 2558 | /* |
| 2518 | * Make sure we do not leak PI boosting priority to the child. | ||
| 2519 | */ | ||
| 2520 | p->prio = current->normal_prio; | ||
| 2521 | |||
| 2522 | /* | ||
| 2523 | * Revert to default priority/policy on fork if requested. | 2559 | * Revert to default priority/policy on fork if requested. |
| 2524 | */ | 2560 | */ |
| 2525 | if (unlikely(p->sched_reset_on_fork)) { | 2561 | if (unlikely(p->sched_reset_on_fork)) { |
| 2526 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | 2562 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { |
| 2527 | p->policy = SCHED_NORMAL; | 2563 | p->policy = SCHED_NORMAL; |
| 2528 | 2564 | p->normal_prio = p->static_prio; | |
| 2529 | if (p->normal_prio < DEFAULT_PRIO) | 2565 | } |
| 2530 | p->prio = DEFAULT_PRIO; | ||
| 2531 | 2566 | ||
| 2532 | if (PRIO_TO_NICE(p->static_prio) < 0) { | 2567 | if (PRIO_TO_NICE(p->static_prio) < 0) { |
| 2533 | p->static_prio = NICE_TO_PRIO(0); | 2568 | p->static_prio = NICE_TO_PRIO(0); |
| 2569 | p->normal_prio = p->static_prio; | ||
| 2534 | set_load_weight(p); | 2570 | set_load_weight(p); |
| 2535 | } | 2571 | } |
| 2536 | 2572 | ||
| @@ -2541,11 +2577,19 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
| 2541 | p->sched_reset_on_fork = 0; | 2577 | p->sched_reset_on_fork = 0; |
| 2542 | } | 2578 | } |
| 2543 | 2579 | ||
| 2580 | /* | ||
| 2581 | * Make sure we do not leak PI boosting priority to the child. | ||
| 2582 | */ | ||
| 2583 | p->prio = current->normal_prio; | ||
| 2584 | |||
| 2544 | if (!rt_prio(p->prio)) | 2585 | if (!rt_prio(p->prio)) |
| 2545 | p->sched_class = &fair_sched_class; | 2586 | p->sched_class = &fair_sched_class; |
| 2546 | 2587 | ||
| 2588 | if (p->sched_class->task_fork) | ||
| 2589 | p->sched_class->task_fork(p); | ||
| 2590 | |||
| 2547 | #ifdef CONFIG_SMP | 2591 | #ifdef CONFIG_SMP |
| 2548 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); | 2592 | cpu = select_task_rq(p, SD_BALANCE_FORK, 0); |
| 2549 | #endif | 2593 | #endif |
| 2550 | set_task_cpu(p, cpu); | 2594 | set_task_cpu(p, cpu); |
| 2551 | 2595 | ||
| @@ -2580,19 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 2580 | rq = task_rq_lock(p, &flags); | 2624 | rq = task_rq_lock(p, &flags); |
| 2581 | BUG_ON(p->state != TASK_RUNNING); | 2625 | BUG_ON(p->state != TASK_RUNNING); |
| 2582 | update_rq_clock(rq); | 2626 | update_rq_clock(rq); |
| 2583 | 2627 | activate_task(rq, p, 0); | |
| 2584 | p->prio = effective_prio(p); | ||
| 2585 | |||
| 2586 | if (!p->sched_class->task_new || !current->se.on_rq) { | ||
| 2587 | activate_task(rq, p, 0); | ||
| 2588 | } else { | ||
| 2589 | /* | ||
| 2590 | * Let the scheduling class do new task startup | ||
| 2591 | * management (if any): | ||
| 2592 | */ | ||
| 2593 | p->sched_class->task_new(rq, p); | ||
| 2594 | inc_nr_running(rq); | ||
| 2595 | } | ||
| 2596 | trace_sched_wakeup_new(rq, p, 1); | 2628 | trace_sched_wakeup_new(rq, p, 1); |
| 2597 | check_preempt_curr(rq, p, WF_FORK); | 2629 | check_preempt_curr(rq, p, WF_FORK); |
| 2598 | #ifdef CONFIG_SMP | 2630 | #ifdef CONFIG_SMP |
| @@ -2816,14 +2848,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2816 | */ | 2848 | */ |
| 2817 | arch_start_context_switch(prev); | 2849 | arch_start_context_switch(prev); |
| 2818 | 2850 | ||
| 2819 | if (unlikely(!mm)) { | 2851 | if (likely(!mm)) { |
| 2820 | next->active_mm = oldmm; | 2852 | next->active_mm = oldmm; |
| 2821 | atomic_inc(&oldmm->mm_count); | 2853 | atomic_inc(&oldmm->mm_count); |
| 2822 | enter_lazy_tlb(oldmm, next); | 2854 | enter_lazy_tlb(oldmm, next); |
| 2823 | } else | 2855 | } else |
| 2824 | switch_mm(oldmm, mm, next); | 2856 | switch_mm(oldmm, mm, next); |
| 2825 | 2857 | ||
| 2826 | if (unlikely(!prev->mm)) { | 2858 | if (likely(!prev->mm)) { |
| 2827 | prev->active_mm = NULL; | 2859 | prev->active_mm = NULL; |
| 2828 | rq->prev_mm = oldmm; | 2860 | rq->prev_mm = oldmm; |
| 2829 | } | 2861 | } |
| @@ -2986,15 +3018,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 2986 | } | 3018 | } |
| 2987 | 3019 | ||
| 2988 | /* | 3020 | /* |
| 2989 | * Externally visible per-cpu scheduler statistics: | ||
| 2990 | * cpu_nr_migrations(cpu) - number of migrations into that cpu | ||
| 2991 | */ | ||
| 2992 | u64 cpu_nr_migrations(int cpu) | ||
| 2993 | { | ||
| 2994 | return cpu_rq(cpu)->nr_migrations_in; | ||
| 2995 | } | ||
| 2996 | |||
| 2997 | /* | ||
| 2998 | * Update rq->cpu_load[] statistics. This function is usually called every | 3021 | * Update rq->cpu_load[] statistics. This function is usually called every |
| 2999 | * scheduler tick (TICK_NSEC). | 3022 | * scheduler tick (TICK_NSEC). |
| 3000 | */ | 3023 | */ |
| @@ -3116,7 +3139,7 @@ out: | |||
| 3116 | void sched_exec(void) | 3139 | void sched_exec(void) |
| 3117 | { | 3140 | { |
| 3118 | int new_cpu, this_cpu = get_cpu(); | 3141 | int new_cpu, this_cpu = get_cpu(); |
| 3119 | new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); | 3142 | new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0); |
| 3120 | put_cpu(); | 3143 | put_cpu(); |
| 3121 | if (new_cpu != this_cpu) | 3144 | if (new_cpu != this_cpu) |
| 3122 | sched_migrate_task(current, new_cpu); | 3145 | sched_migrate_task(current, new_cpu); |
| @@ -3132,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, | |||
| 3132 | deactivate_task(src_rq, p, 0); | 3155 | deactivate_task(src_rq, p, 0); |
| 3133 | set_task_cpu(p, this_cpu); | 3156 | set_task_cpu(p, this_cpu); |
| 3134 | activate_task(this_rq, p, 0); | 3157 | activate_task(this_rq, p, 0); |
| 3135 | /* | ||
| 3136 | * Note that idle threads have a prio of MAX_PRIO, for this test | ||
| 3137 | * to be always true for them. | ||
| 3138 | */ | ||
| 3139 | check_preempt_curr(this_rq, p, 0); | 3158 | check_preempt_curr(this_rq, p, 0); |
| 3140 | } | 3159 | } |
| 3141 | 3160 | ||
| @@ -3658,6 +3677,7 @@ static void update_group_power(struct sched_domain *sd, int cpu) | |||
| 3658 | 3677 | ||
| 3659 | /** | 3678 | /** |
| 3660 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3679 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 3680 | * @sd: The sched_domain whose statistics are to be updated. | ||
| 3661 | * @group: sched_group whose statistics are to be updated. | 3681 | * @group: sched_group whose statistics are to be updated. |
| 3662 | * @this_cpu: Cpu for which load balance is currently performed. | 3682 | * @this_cpu: Cpu for which load balance is currently performed. |
| 3663 | * @idle: Idle status of this_cpu | 3683 | * @idle: Idle status of this_cpu |
| @@ -4093,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 4093 | unsigned long flags; | 4113 | unsigned long flags; |
| 4094 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4114 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
| 4095 | 4115 | ||
| 4096 | cpumask_setall(cpus); | 4116 | cpumask_copy(cpus, cpu_active_mask); |
| 4097 | 4117 | ||
| 4098 | /* | 4118 | /* |
| 4099 | * When power savings policy is enabled for the parent domain, idle | 4119 | * When power savings policy is enabled for the parent domain, idle |
| @@ -4256,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 4256 | int all_pinned = 0; | 4276 | int all_pinned = 0; |
| 4257 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); | 4277 | struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); |
| 4258 | 4278 | ||
| 4259 | cpumask_setall(cpus); | 4279 | cpumask_copy(cpus, cpu_active_mask); |
| 4260 | 4280 | ||
| 4261 | /* | 4281 | /* |
| 4262 | * When power savings policy is enabled for the parent domain, idle | 4282 | * When power savings policy is enabled for the parent domain, idle |
| @@ -4396,6 +4416,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 4396 | int pulled_task = 0; | 4416 | int pulled_task = 0; |
| 4397 | unsigned long next_balance = jiffies + HZ; | 4417 | unsigned long next_balance = jiffies + HZ; |
| 4398 | 4418 | ||
| 4419 | this_rq->idle_stamp = this_rq->clock; | ||
| 4420 | |||
| 4421 | if (this_rq->avg_idle < sysctl_sched_migration_cost) | ||
| 4422 | return; | ||
| 4423 | |||
| 4399 | for_each_domain(this_cpu, sd) { | 4424 | for_each_domain(this_cpu, sd) { |
| 4400 | unsigned long interval; | 4425 | unsigned long interval; |
| 4401 | 4426 | ||
| @@ -4410,8 +4435,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq) | |||
| 4410 | interval = msecs_to_jiffies(sd->balance_interval); | 4435 | interval = msecs_to_jiffies(sd->balance_interval); |
| 4411 | if (time_after(next_balance, sd->last_balance + interval)) | 4436 | if (time_after(next_balance, sd->last_balance + interval)) |
| 4412 | next_balance = sd->last_balance + interval; | 4437 | next_balance = sd->last_balance + interval; |
| 4413 | if (pulled_task) | 4438 | if (pulled_task) { |
| 4439 | this_rq->idle_stamp = 0; | ||
| 4414 | break; | 4440 | break; |
| 4441 | } | ||
| 4415 | } | 4442 | } |
| 4416 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 4443 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
| 4417 | /* | 4444 | /* |
| @@ -4646,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick) | |||
| 4646 | cpumask_set_cpu(cpu, nohz.cpu_mask); | 4673 | cpumask_set_cpu(cpu, nohz.cpu_mask); |
| 4647 | 4674 | ||
| 4648 | /* time for ilb owner also to sleep */ | 4675 | /* time for ilb owner also to sleep */ |
| 4649 | if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { | 4676 | if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { |
| 4650 | if (atomic_read(&nohz.load_balancer) == cpu) | 4677 | if (atomic_read(&nohz.load_balancer) == cpu) |
| 4651 | atomic_set(&nohz.load_balancer, -1); | 4678 | atomic_set(&nohz.load_balancer, -1); |
| 4652 | return 0; | 4679 | return 0; |
| @@ -5013,8 +5040,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
| 5013 | p->gtime = cputime_add(p->gtime, cputime); | 5040 | p->gtime = cputime_add(p->gtime, cputime); |
| 5014 | 5041 | ||
| 5015 | /* Add guest time to cpustat. */ | 5042 | /* Add guest time to cpustat. */ |
| 5016 | cpustat->user = cputime64_add(cpustat->user, tmp); | 5043 | if (TASK_NICE(p) > 0) { |
| 5017 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | 5044 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
| 5045 | cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); | ||
| 5046 | } else { | ||
| 5047 | cpustat->user = cputime64_add(cpustat->user, tmp); | ||
| 5048 | cpustat->guest = cputime64_add(cpustat->guest, tmp); | ||
| 5049 | } | ||
| 5018 | } | 5050 | } |
| 5019 | 5051 | ||
| 5020 | /* | 5052 | /* |
| @@ -5129,60 +5161,86 @@ void account_idle_ticks(unsigned long ticks) | |||
| 5129 | * Use precise platform statistics if available: | 5161 | * Use precise platform statistics if available: |
| 5130 | */ | 5162 | */ |
| 5131 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 5163 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
| 5132 | cputime_t task_utime(struct task_struct *p) | 5164 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 5133 | { | 5165 | { |
| 5134 | return p->utime; | 5166 | *ut = p->utime; |
| 5167 | *st = p->stime; | ||
| 5135 | } | 5168 | } |
| 5136 | 5169 | ||
| 5137 | cputime_t task_stime(struct task_struct *p) | 5170 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) |
| 5138 | { | 5171 | { |
| 5139 | return p->stime; | 5172 | struct task_cputime cputime; |
| 5173 | |||
| 5174 | thread_group_cputime(p, &cputime); | ||
| 5175 | |||
| 5176 | *ut = cputime.utime; | ||
| 5177 | *st = cputime.stime; | ||
| 5140 | } | 5178 | } |
| 5141 | #else | 5179 | #else |
| 5142 | cputime_t task_utime(struct task_struct *p) | 5180 | |
| 5181 | #ifndef nsecs_to_cputime | ||
| 5182 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
| 5183 | #endif | ||
| 5184 | |||
| 5185 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 5143 | { | 5186 | { |
| 5144 | clock_t utime = cputime_to_clock_t(p->utime), | 5187 | cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); |
| 5145 | total = utime + cputime_to_clock_t(p->stime); | ||
| 5146 | u64 temp; | ||
| 5147 | 5188 | ||
| 5148 | /* | 5189 | /* |
| 5149 | * Use CFS's precise accounting: | 5190 | * Use CFS's precise accounting: |
| 5150 | */ | 5191 | */ |
| 5151 | temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); | 5192 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); |
| 5152 | 5193 | ||
| 5153 | if (total) { | 5194 | if (total) { |
| 5154 | temp *= utime; | 5195 | u64 temp; |
| 5196 | |||
| 5197 | temp = (u64)(rtime * utime); | ||
| 5155 | do_div(temp, total); | 5198 | do_div(temp, total); |
| 5156 | } | 5199 | utime = (cputime_t)temp; |
| 5157 | utime = (clock_t)temp; | 5200 | } else |
| 5201 | utime = rtime; | ||
| 5158 | 5202 | ||
| 5159 | p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); | 5203 | /* |
| 5160 | return p->prev_utime; | 5204 | * Compare with previous values, to keep monotonicity: |
| 5205 | */ | ||
| 5206 | p->prev_utime = max(p->prev_utime, utime); | ||
| 5207 | p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); | ||
| 5208 | |||
| 5209 | *ut = p->prev_utime; | ||
| 5210 | *st = p->prev_stime; | ||
| 5161 | } | 5211 | } |
| 5162 | 5212 | ||
| 5163 | cputime_t task_stime(struct task_struct *p) | 5213 | /* |
| 5214 | * Must be called with siglock held. | ||
| 5215 | */ | ||
| 5216 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 5164 | { | 5217 | { |
| 5165 | clock_t stime; | 5218 | struct signal_struct *sig = p->signal; |
| 5219 | struct task_cputime cputime; | ||
| 5220 | cputime_t rtime, utime, total; | ||
| 5166 | 5221 | ||
| 5167 | /* | 5222 | thread_group_cputime(p, &cputime); |
| 5168 | * Use CFS's precise accounting. (we subtract utime from | ||
| 5169 | * the total, to make sure the total observed by userspace | ||
| 5170 | * grows monotonically - apps rely on that): | ||
| 5171 | */ | ||
| 5172 | stime = nsec_to_clock_t(p->se.sum_exec_runtime) - | ||
| 5173 | cputime_to_clock_t(task_utime(p)); | ||
| 5174 | 5223 | ||
| 5175 | if (stime >= 0) | 5224 | total = cputime_add(cputime.utime, cputime.stime); |
| 5176 | p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); | 5225 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); |
| 5177 | 5226 | ||
| 5178 | return p->prev_stime; | 5227 | if (total) { |
| 5179 | } | 5228 | u64 temp; |
| 5180 | #endif | ||
| 5181 | 5229 | ||
| 5182 | inline cputime_t task_gtime(struct task_struct *p) | 5230 | temp = (u64)(rtime * cputime.utime); |
| 5183 | { | 5231 | do_div(temp, total); |
| 5184 | return p->gtime; | 5232 | utime = (cputime_t)temp; |
| 5233 | } else | ||
| 5234 | utime = rtime; | ||
| 5235 | |||
| 5236 | sig->prev_utime = max(sig->prev_utime, utime); | ||
| 5237 | sig->prev_stime = max(sig->prev_stime, | ||
| 5238 | cputime_sub(rtime, sig->prev_utime)); | ||
| 5239 | |||
| 5240 | *ut = sig->prev_utime; | ||
| 5241 | *st = sig->prev_stime; | ||
| 5185 | } | 5242 | } |
| 5243 | #endif | ||
| 5186 | 5244 | ||
| 5187 | /* | 5245 | /* |
| 5188 | * This function gets called by the timer code, with HZ frequency. | 5246 | * This function gets called by the timer code, with HZ frequency. |
| @@ -5317,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev) | |||
| 5317 | #endif | 5375 | #endif |
| 5318 | } | 5376 | } |
| 5319 | 5377 | ||
| 5320 | static void put_prev_task(struct rq *rq, struct task_struct *p) | 5378 | static void put_prev_task(struct rq *rq, struct task_struct *prev) |
| 5321 | { | 5379 | { |
| 5322 | u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; | 5380 | if (prev->state == TASK_RUNNING) { |
| 5381 | u64 runtime = prev->se.sum_exec_runtime; | ||
| 5323 | 5382 | ||
| 5324 | update_avg(&p->se.avg_running, runtime); | 5383 | runtime -= prev->se.prev_sum_exec_runtime; |
| 5384 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | ||
| 5325 | 5385 | ||
| 5326 | if (p->state == TASK_RUNNING) { | ||
| 5327 | /* | 5386 | /* |
| 5328 | * In order to avoid avg_overlap growing stale when we are | 5387 | * In order to avoid avg_overlap growing stale when we are |
| 5329 | * indeed overlapping and hence not getting put to sleep, grow | 5388 | * indeed overlapping and hence not getting put to sleep, grow |
| @@ -5333,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p) | |||
| 5333 | * correlates to the amount of cache footprint a task can | 5392 | * correlates to the amount of cache footprint a task can |
| 5334 | * build up. | 5393 | * build up. |
| 5335 | */ | 5394 | */ |
| 5336 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | 5395 | update_avg(&prev->se.avg_overlap, runtime); |
| 5337 | update_avg(&p->se.avg_overlap, runtime); | ||
| 5338 | } else { | ||
| 5339 | update_avg(&p->se.avg_running, 0); | ||
| 5340 | } | 5396 | } |
| 5341 | p->sched_class->put_prev_task(rq, p); | 5397 | prev->sched_class->put_prev_task(rq, prev); |
| 5342 | } | 5398 | } |
| 5343 | 5399 | ||
| 5344 | /* | 5400 | /* |
| @@ -5448,7 +5504,7 @@ need_resched_nonpreemptible: | |||
| 5448 | } | 5504 | } |
| 5449 | EXPORT_SYMBOL(schedule); | 5505 | EXPORT_SYMBOL(schedule); |
| 5450 | 5506 | ||
| 5451 | #ifdef CONFIG_SMP | 5507 | #ifdef CONFIG_MUTEX_SPIN_ON_OWNER |
| 5452 | /* | 5508 | /* |
| 5453 | * Look out! "owner" is an entirely speculative pointer | 5509 | * Look out! "owner" is an entirely speculative pointer |
| 5454 | * access and not reliable. | 5510 | * access and not reliable. |
| @@ -6142,22 +6198,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | |||
| 6142 | BUG_ON(p->se.on_rq); | 6198 | BUG_ON(p->se.on_rq); |
| 6143 | 6199 | ||
| 6144 | p->policy = policy; | 6200 | p->policy = policy; |
| 6145 | switch (p->policy) { | ||
| 6146 | case SCHED_NORMAL: | ||
| 6147 | case SCHED_BATCH: | ||
| 6148 | case SCHED_IDLE: | ||
| 6149 | p->sched_class = &fair_sched_class; | ||
| 6150 | break; | ||
| 6151 | case SCHED_FIFO: | ||
| 6152 | case SCHED_RR: | ||
| 6153 | p->sched_class = &rt_sched_class; | ||
| 6154 | break; | ||
| 6155 | } | ||
| 6156 | |||
| 6157 | p->rt_priority = prio; | 6201 | p->rt_priority = prio; |
| 6158 | p->normal_prio = normal_prio(p); | 6202 | p->normal_prio = normal_prio(p); |
| 6159 | /* we are holding p->pi_lock already */ | 6203 | /* we are holding p->pi_lock already */ |
| 6160 | p->prio = rt_mutex_getprio(p); | 6204 | p->prio = rt_mutex_getprio(p); |
| 6205 | if (rt_prio(p->prio)) | ||
| 6206 | p->sched_class = &rt_sched_class; | ||
| 6207 | else | ||
| 6208 | p->sched_class = &fair_sched_class; | ||
| 6161 | set_load_weight(p); | 6209 | set_load_weight(p); |
| 6162 | } | 6210 | } |
| 6163 | 6211 | ||
| @@ -6560,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, | |||
| 6560 | long sched_getaffinity(pid_t pid, struct cpumask *mask) | 6608 | long sched_getaffinity(pid_t pid, struct cpumask *mask) |
| 6561 | { | 6609 | { |
| 6562 | struct task_struct *p; | 6610 | struct task_struct *p; |
| 6611 | unsigned long flags; | ||
| 6612 | struct rq *rq; | ||
| 6563 | int retval; | 6613 | int retval; |
| 6564 | 6614 | ||
| 6565 | get_online_cpus(); | 6615 | get_online_cpus(); |
| @@ -6574,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 6574 | if (retval) | 6624 | if (retval) |
| 6575 | goto out_unlock; | 6625 | goto out_unlock; |
| 6576 | 6626 | ||
| 6627 | rq = task_rq_lock(p, &flags); | ||
| 6577 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); | 6628 | cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); |
| 6629 | task_rq_unlock(rq, &flags); | ||
| 6578 | 6630 | ||
| 6579 | out_unlock: | 6631 | out_unlock: |
| 6580 | read_unlock(&tasklist_lock); | 6632 | read_unlock(&tasklist_lock); |
| @@ -6720,9 +6772,6 @@ EXPORT_SYMBOL(yield); | |||
| 6720 | /* | 6772 | /* |
| 6721 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 6773 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
| 6722 | * that process accounting knows that this is a task in IO wait state. | 6774 | * that process accounting knows that this is a task in IO wait state. |
| 6723 | * | ||
| 6724 | * But don't do that if it is a deliberate, throttling IO wait (this task | ||
| 6725 | * has set its backing_dev_info: the queue against which it should throttle) | ||
| 6726 | */ | 6775 | */ |
| 6727 | void __sched io_schedule(void) | 6776 | void __sched io_schedule(void) |
| 6728 | { | 6777 | { |
| @@ -6815,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
| 6815 | { | 6864 | { |
| 6816 | struct task_struct *p; | 6865 | struct task_struct *p; |
| 6817 | unsigned int time_slice; | 6866 | unsigned int time_slice; |
| 6867 | unsigned long flags; | ||
| 6868 | struct rq *rq; | ||
| 6818 | int retval; | 6869 | int retval; |
| 6819 | struct timespec t; | 6870 | struct timespec t; |
| 6820 | 6871 | ||
| @@ -6831,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
| 6831 | if (retval) | 6882 | if (retval) |
| 6832 | goto out_unlock; | 6883 | goto out_unlock; |
| 6833 | 6884 | ||
| 6834 | time_slice = p->sched_class->get_rr_interval(p); | 6885 | rq = task_rq_lock(p, &flags); |
| 6886 | time_slice = p->sched_class->get_rr_interval(rq, p); | ||
| 6887 | task_rq_unlock(rq, &flags); | ||
| 6835 | 6888 | ||
| 6836 | read_unlock(&tasklist_lock); | 6889 | read_unlock(&tasklist_lock); |
| 6837 | jiffies_to_timespec(time_slice, &t); | 6890 | jiffies_to_timespec(time_slice, &t); |
| @@ -6905,7 +6958,7 @@ void show_state_filter(unsigned long state_filter) | |||
| 6905 | /* | 6958 | /* |
| 6906 | * Only show locks if all tasks are dumped: | 6959 | * Only show locks if all tasks are dumped: |
| 6907 | */ | 6960 | */ |
| 6908 | if (state_filter == -1) | 6961 | if (!state_filter) |
| 6909 | debug_show_all_locks(); | 6962 | debug_show_all_locks(); |
| 6910 | } | 6963 | } |
| 6911 | 6964 | ||
| @@ -6932,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
| 6932 | __sched_fork(idle); | 6985 | __sched_fork(idle); |
| 6933 | idle->se.exec_start = sched_clock(); | 6986 | idle->se.exec_start = sched_clock(); |
| 6934 | 6987 | ||
| 6935 | idle->prio = idle->normal_prio = MAX_PRIO; | ||
| 6936 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 6988 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
| 6937 | __set_task_cpu(idle, cpu); | 6989 | __set_task_cpu(idle, cpu); |
| 6938 | 6990 | ||
| @@ -6973,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask; | |||
| 6973 | * | 7025 | * |
| 6974 | * This idea comes from the SD scheduler of Con Kolivas: | 7026 | * This idea comes from the SD scheduler of Con Kolivas: |
| 6975 | */ | 7027 | */ |
| 6976 | static inline void sched_init_granularity(void) | 7028 | static int get_update_sysctl_factor(void) |
| 6977 | { | 7029 | { |
| 6978 | unsigned int factor = 1 + ilog2(num_online_cpus()); | 7030 | unsigned int cpus = min_t(int, num_online_cpus(), 8); |
| 6979 | const unsigned long limit = 200000000; | 7031 | unsigned int factor; |
| 7032 | |||
| 7033 | switch (sysctl_sched_tunable_scaling) { | ||
| 7034 | case SCHED_TUNABLESCALING_NONE: | ||
| 7035 | factor = 1; | ||
| 7036 | break; | ||
| 7037 | case SCHED_TUNABLESCALING_LINEAR: | ||
| 7038 | factor = cpus; | ||
| 7039 | break; | ||
| 7040 | case SCHED_TUNABLESCALING_LOG: | ||
| 7041 | default: | ||
| 7042 | factor = 1 + ilog2(cpus); | ||
| 7043 | break; | ||
| 7044 | } | ||
| 6980 | 7045 | ||
| 6981 | sysctl_sched_min_granularity *= factor; | 7046 | return factor; |
| 6982 | if (sysctl_sched_min_granularity > limit) | 7047 | } |
| 6983 | sysctl_sched_min_granularity = limit; | ||
| 6984 | 7048 | ||
| 6985 | sysctl_sched_latency *= factor; | 7049 | static void update_sysctl(void) |
| 6986 | if (sysctl_sched_latency > limit) | 7050 | { |
| 6987 | sysctl_sched_latency = limit; | 7051 | unsigned int factor = get_update_sysctl_factor(); |
| 6988 | 7052 | ||
| 6989 | sysctl_sched_wakeup_granularity *= factor; | 7053 | #define SET_SYSCTL(name) \ |
| 7054 | (sysctl_##name = (factor) * normalized_sysctl_##name) | ||
| 7055 | SET_SYSCTL(sched_min_granularity); | ||
| 7056 | SET_SYSCTL(sched_latency); | ||
| 7057 | SET_SYSCTL(sched_wakeup_granularity); | ||
| 7058 | SET_SYSCTL(sched_shares_ratelimit); | ||
| 7059 | #undef SET_SYSCTL | ||
| 7060 | } | ||
| 6990 | 7061 | ||
| 6991 | sysctl_sched_shares_ratelimit *= factor; | 7062 | static inline void sched_init_granularity(void) |
| 7063 | { | ||
| 7064 | update_sysctl(); | ||
| 6992 | } | 7065 | } |
| 6993 | 7066 | ||
| 6994 | #ifdef CONFIG_SMP | 7067 | #ifdef CONFIG_SMP |
| @@ -7025,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 7025 | int ret = 0; | 7098 | int ret = 0; |
| 7026 | 7099 | ||
| 7027 | rq = task_rq_lock(p, &flags); | 7100 | rq = task_rq_lock(p, &flags); |
| 7028 | if (!cpumask_intersects(new_mask, cpu_online_mask)) { | 7101 | if (!cpumask_intersects(new_mask, cpu_active_mask)) { |
| 7029 | ret = -EINVAL; | 7102 | ret = -EINVAL; |
| 7030 | goto out; | 7103 | goto out; |
| 7031 | } | 7104 | } |
| @@ -7047,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
| 7047 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 7120 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
| 7048 | goto out; | 7121 | goto out; |
| 7049 | 7122 | ||
| 7050 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7123 | if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { |
| 7051 | /* Need help from migration thread: drop lock and wait. */ | 7124 | /* Need help from migration thread: drop lock and wait. */ |
| 7052 | struct task_struct *mt = rq->migration_thread; | 7125 | struct task_struct *mt = rq->migration_thread; |
| 7053 | 7126 | ||
| @@ -7201,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
| 7201 | 7274 | ||
| 7202 | again: | 7275 | again: |
| 7203 | /* Look for allowed, online CPU in same node. */ | 7276 | /* Look for allowed, online CPU in same node. */ |
| 7204 | for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) | 7277 | for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) |
| 7205 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 7278 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) |
| 7206 | goto move; | 7279 | goto move; |
| 7207 | 7280 | ||
| 7208 | /* Any allowed, online CPU? */ | 7281 | /* Any allowed, online CPU? */ |
| 7209 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); | 7282 | dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); |
| 7210 | if (dest_cpu < nr_cpu_ids) | 7283 | if (dest_cpu < nr_cpu_ids) |
| 7211 | goto move; | 7284 | goto move; |
| 7212 | 7285 | ||
| 7213 | /* No more Mr. Nice Guy. */ | 7286 | /* No more Mr. Nice Guy. */ |
| 7214 | if (dest_cpu >= nr_cpu_ids) { | 7287 | if (dest_cpu >= nr_cpu_ids) { |
| 7215 | cpuset_cpus_allowed_locked(p, &p->cpus_allowed); | 7288 | cpuset_cpus_allowed_locked(p, &p->cpus_allowed); |
| 7216 | dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); | 7289 | dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); |
| 7217 | 7290 | ||
| 7218 | /* | 7291 | /* |
| 7219 | * Don't tell them about moving exiting tasks or | 7292 | * Don't tell them about moving exiting tasks or |
| @@ -7242,7 +7315,7 @@ move: | |||
| 7242 | */ | 7315 | */ |
| 7243 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 7316 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
| 7244 | { | 7317 | { |
| 7245 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); | 7318 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
| 7246 | unsigned long flags; | 7319 | unsigned long flags; |
| 7247 | 7320 | ||
| 7248 | local_irq_save(flags); | 7321 | local_irq_save(flags); |
| @@ -7376,17 +7449,16 @@ static struct ctl_table sd_ctl_dir[] = { | |||
| 7376 | .procname = "sched_domain", | 7449 | .procname = "sched_domain", |
| 7377 | .mode = 0555, | 7450 | .mode = 0555, |
| 7378 | }, | 7451 | }, |
| 7379 | {0, }, | 7452 | {} |
| 7380 | }; | 7453 | }; |
| 7381 | 7454 | ||
| 7382 | static struct ctl_table sd_ctl_root[] = { | 7455 | static struct ctl_table sd_ctl_root[] = { |
| 7383 | { | 7456 | { |
| 7384 | .ctl_name = CTL_KERN, | ||
| 7385 | .procname = "kernel", | 7457 | .procname = "kernel", |
| 7386 | .mode = 0555, | 7458 | .mode = 0555, |
| 7387 | .child = sd_ctl_dir, | 7459 | .child = sd_ctl_dir, |
| 7388 | }, | 7460 | }, |
| 7389 | {0, }, | 7461 | {} |
| 7390 | }; | 7462 | }; |
| 7391 | 7463 | ||
| 7392 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 7464 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
| @@ -7496,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
| 7496 | static struct ctl_table_header *sd_sysctl_header; | 7568 | static struct ctl_table_header *sd_sysctl_header; |
| 7497 | static void register_sched_domain_sysctl(void) | 7569 | static void register_sched_domain_sysctl(void) |
| 7498 | { | 7570 | { |
| 7499 | int i, cpu_num = num_online_cpus(); | 7571 | int i, cpu_num = num_possible_cpus(); |
| 7500 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 7572 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
| 7501 | char buf[32]; | 7573 | char buf[32]; |
| 7502 | 7574 | ||
| @@ -7506,7 +7578,7 @@ static void register_sched_domain_sysctl(void) | |||
| 7506 | if (entry == NULL) | 7578 | if (entry == NULL) |
| 7507 | return; | 7579 | return; |
| 7508 | 7580 | ||
| 7509 | for_each_online_cpu(i) { | 7581 | for_each_possible_cpu(i) { |
| 7510 | snprintf(buf, 32, "cpu%d", i); | 7582 | snprintf(buf, 32, "cpu%d", i); |
| 7511 | entry->procname = kstrdup(buf, GFP_KERNEL); | 7583 | entry->procname = kstrdup(buf, GFP_KERNEL); |
| 7512 | entry->mode = 0555; | 7584 | entry->mode = 0555; |
| @@ -7636,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 7636 | spin_lock_irq(&rq->lock); | 7708 | spin_lock_irq(&rq->lock); |
| 7637 | update_rq_clock(rq); | 7709 | update_rq_clock(rq); |
| 7638 | deactivate_task(rq, rq->idle, 0); | 7710 | deactivate_task(rq, rq->idle, 0); |
| 7639 | rq->idle->static_prio = MAX_PRIO; | ||
| 7640 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | 7711 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
| 7641 | rq->idle->sched_class = &idle_sched_class; | 7712 | rq->idle->sched_class = &idle_sched_class; |
| 7642 | migrate_dead_tasks(cpu); | 7713 | migrate_dead_tasks(cpu); |
| @@ -7710,6 +7781,16 @@ early_initcall(migration_init); | |||
| 7710 | 7781 | ||
| 7711 | #ifdef CONFIG_SCHED_DEBUG | 7782 | #ifdef CONFIG_SCHED_DEBUG |
| 7712 | 7783 | ||
| 7784 | static __read_mostly int sched_domain_debug_enabled; | ||
| 7785 | |||
| 7786 | static int __init sched_domain_debug_setup(char *str) | ||
| 7787 | { | ||
| 7788 | sched_domain_debug_enabled = 1; | ||
| 7789 | |||
| 7790 | return 0; | ||
| 7791 | } | ||
| 7792 | early_param("sched_debug", sched_domain_debug_setup); | ||
| 7793 | |||
| 7713 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 7794 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
| 7714 | struct cpumask *groupmask) | 7795 | struct cpumask *groupmask) |
| 7715 | { | 7796 | { |
| @@ -7796,6 +7877,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 7796 | cpumask_var_t groupmask; | 7877 | cpumask_var_t groupmask; |
| 7797 | int level = 0; | 7878 | int level = 0; |
| 7798 | 7879 | ||
| 7880 | if (!sched_domain_debug_enabled) | ||
| 7881 | return; | ||
| 7882 | |||
| 7799 | if (!sd) { | 7883 | if (!sd) { |
| 7800 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 7884 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
| 7801 | return; | 7885 | return; |
| @@ -7875,6 +7959,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
| 7875 | 7959 | ||
| 7876 | static void free_rootdomain(struct root_domain *rd) | 7960 | static void free_rootdomain(struct root_domain *rd) |
| 7877 | { | 7961 | { |
| 7962 | synchronize_sched(); | ||
| 7963 | |||
| 7878 | cpupri_cleanup(&rd->cpupri); | 7964 | cpupri_cleanup(&rd->cpupri); |
| 7879 | 7965 | ||
| 7880 | free_cpumask_var(rd->rto_mask); | 7966 | free_cpumask_var(rd->rto_mask); |
| @@ -8015,6 +8101,7 @@ static cpumask_var_t cpu_isolated_map; | |||
| 8015 | /* Setup the mask of cpus configured for isolated domains */ | 8101 | /* Setup the mask of cpus configured for isolated domains */ |
| 8016 | static int __init isolated_cpu_setup(char *str) | 8102 | static int __init isolated_cpu_setup(char *str) |
| 8017 | { | 8103 | { |
| 8104 | alloc_bootmem_cpumask_var(&cpu_isolated_map); | ||
| 8018 | cpulist_parse(str, cpu_isolated_map); | 8105 | cpulist_parse(str, cpu_isolated_map); |
| 8019 | return 1; | 8106 | return 1; |
| 8020 | } | 8107 | } |
| @@ -8851,7 +8938,7 @@ static int build_sched_domains(const struct cpumask *cpu_map) | |||
| 8851 | return __build_sched_domains(cpu_map, NULL); | 8938 | return __build_sched_domains(cpu_map, NULL); |
| 8852 | } | 8939 | } |
| 8853 | 8940 | ||
| 8854 | static struct cpumask *doms_cur; /* current sched domains */ | 8941 | static cpumask_var_t *doms_cur; /* current sched domains */ |
| 8855 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | 8942 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ |
| 8856 | static struct sched_domain_attr *dattr_cur; | 8943 | static struct sched_domain_attr *dattr_cur; |
| 8857 | /* attribues of custom domains in 'doms_cur' */ | 8944 | /* attribues of custom domains in 'doms_cur' */ |
| @@ -8873,6 +8960,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void) | |||
| 8873 | return 0; | 8960 | return 0; |
| 8874 | } | 8961 | } |
| 8875 | 8962 | ||
| 8963 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) | ||
| 8964 | { | ||
| 8965 | int i; | ||
| 8966 | cpumask_var_t *doms; | ||
| 8967 | |||
| 8968 | doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); | ||
| 8969 | if (!doms) | ||
| 8970 | return NULL; | ||
| 8971 | for (i = 0; i < ndoms; i++) { | ||
| 8972 | if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { | ||
| 8973 | free_sched_domains(doms, i); | ||
| 8974 | return NULL; | ||
| 8975 | } | ||
| 8976 | } | ||
| 8977 | return doms; | ||
| 8978 | } | ||
| 8979 | |||
| 8980 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) | ||
| 8981 | { | ||
| 8982 | unsigned int i; | ||
| 8983 | for (i = 0; i < ndoms; i++) | ||
| 8984 | free_cpumask_var(doms[i]); | ||
| 8985 | kfree(doms); | ||
| 8986 | } | ||
| 8987 | |||
| 8876 | /* | 8988 | /* |
| 8877 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 8989 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
| 8878 | * For now this just excludes isolated cpus, but could be used to | 8990 | * For now this just excludes isolated cpus, but could be used to |
| @@ -8884,12 +8996,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map) | |||
| 8884 | 8996 | ||
| 8885 | arch_update_cpu_topology(); | 8997 | arch_update_cpu_topology(); |
| 8886 | ndoms_cur = 1; | 8998 | ndoms_cur = 1; |
| 8887 | doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); | 8999 | doms_cur = alloc_sched_domains(ndoms_cur); |
| 8888 | if (!doms_cur) | 9000 | if (!doms_cur) |
| 8889 | doms_cur = fallback_doms; | 9001 | doms_cur = &fallback_doms; |
| 8890 | cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); | 9002 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
| 8891 | dattr_cur = NULL; | 9003 | dattr_cur = NULL; |
| 8892 | err = build_sched_domains(doms_cur); | 9004 | err = build_sched_domains(doms_cur[0]); |
| 8893 | register_sched_domain_sysctl(); | 9005 | register_sched_domain_sysctl(); |
| 8894 | 9006 | ||
| 8895 | return err; | 9007 | return err; |
| @@ -8939,19 +9051,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
| 8939 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | 9051 | * doms_new[] to the current sched domain partitioning, doms_cur[]. |
| 8940 | * It destroys each deleted domain and builds each new domain. | 9052 | * It destroys each deleted domain and builds each new domain. |
| 8941 | * | 9053 | * |
| 8942 | * 'doms_new' is an array of cpumask's of length 'ndoms_new'. | 9054 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. |
| 8943 | * The masks don't intersect (don't overlap.) We should setup one | 9055 | * The masks don't intersect (don't overlap.) We should setup one |
| 8944 | * sched domain for each mask. CPUs not in any of the cpumasks will | 9056 | * sched domain for each mask. CPUs not in any of the cpumasks will |
| 8945 | * not be load balanced. If the same cpumask appears both in the | 9057 | * not be load balanced. If the same cpumask appears both in the |
| 8946 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | 9058 | * current 'doms_cur' domains and in the new 'doms_new', we can leave |
| 8947 | * it as it is. | 9059 | * it as it is. |
| 8948 | * | 9060 | * |
| 8949 | * The passed in 'doms_new' should be kmalloc'd. This routine takes | 9061 | * The passed in 'doms_new' should be allocated using |
| 8950 | * ownership of it and will kfree it when done with it. If the caller | 9062 | * alloc_sched_domains. This routine takes ownership of it and will |
| 8951 | * failed the kmalloc call, then it can pass in doms_new == NULL && | 9063 | * free_sched_domains it when done with it. If the caller failed the |
| 8952 | * ndoms_new == 1, and partition_sched_domains() will fallback to | 9064 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, |
| 8953 | * the single partition 'fallback_doms', it also forces the domains | 9065 | * and partition_sched_domains() will fallback to the single partition |
| 8954 | * to be rebuilt. | 9066 | * 'fallback_doms', it also forces the domains to be rebuilt. |
| 8955 | * | 9067 | * |
| 8956 | * If doms_new == NULL it will be replaced with cpu_online_mask. | 9068 | * If doms_new == NULL it will be replaced with cpu_online_mask. |
| 8957 | * ndoms_new == 0 is a special case for destroying existing domains, | 9069 | * ndoms_new == 0 is a special case for destroying existing domains, |
| @@ -8959,8 +9071,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
| 8959 | * | 9071 | * |
| 8960 | * Call with hotplug lock held | 9072 | * Call with hotplug lock held |
| 8961 | */ | 9073 | */ |
| 8962 | /* FIXME: Change to struct cpumask *doms_new[] */ | 9074 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
| 8963 | void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, | ||
| 8964 | struct sched_domain_attr *dattr_new) | 9075 | struct sched_domain_attr *dattr_new) |
| 8965 | { | 9076 | { |
| 8966 | int i, j, n; | 9077 | int i, j, n; |
| @@ -8979,40 +9090,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, | |||
| 8979 | /* Destroy deleted domains */ | 9090 | /* Destroy deleted domains */ |
| 8980 | for (i = 0; i < ndoms_cur; i++) { | 9091 | for (i = 0; i < ndoms_cur; i++) { |
| 8981 | for (j = 0; j < n && !new_topology; j++) { | 9092 | for (j = 0; j < n && !new_topology; j++) { |
| 8982 | if (cpumask_equal(&doms_cur[i], &doms_new[j]) | 9093 | if (cpumask_equal(doms_cur[i], doms_new[j]) |
| 8983 | && dattrs_equal(dattr_cur, i, dattr_new, j)) | 9094 | && dattrs_equal(dattr_cur, i, dattr_new, j)) |
| 8984 | goto match1; | 9095 | goto match1; |
| 8985 | } | 9096 | } |
| 8986 | /* no match - a current sched domain not in new doms_new[] */ | 9097 | /* no match - a current sched domain not in new doms_new[] */ |
| 8987 | detach_destroy_domains(doms_cur + i); | 9098 | detach_destroy_domains(doms_cur[i]); |
| 8988 | match1: | 9099 | match1: |
| 8989 | ; | 9100 | ; |
| 8990 | } | 9101 | } |
| 8991 | 9102 | ||
| 8992 | if (doms_new == NULL) { | 9103 | if (doms_new == NULL) { |
| 8993 | ndoms_cur = 0; | 9104 | ndoms_cur = 0; |
| 8994 | doms_new = fallback_doms; | 9105 | doms_new = &fallback_doms; |
| 8995 | cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); | 9106 | cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); |
| 8996 | WARN_ON_ONCE(dattr_new); | 9107 | WARN_ON_ONCE(dattr_new); |
| 8997 | } | 9108 | } |
| 8998 | 9109 | ||
| 8999 | /* Build new domains */ | 9110 | /* Build new domains */ |
| 9000 | for (i = 0; i < ndoms_new; i++) { | 9111 | for (i = 0; i < ndoms_new; i++) { |
| 9001 | for (j = 0; j < ndoms_cur && !new_topology; j++) { | 9112 | for (j = 0; j < ndoms_cur && !new_topology; j++) { |
| 9002 | if (cpumask_equal(&doms_new[i], &doms_cur[j]) | 9113 | if (cpumask_equal(doms_new[i], doms_cur[j]) |
| 9003 | && dattrs_equal(dattr_new, i, dattr_cur, j)) | 9114 | && dattrs_equal(dattr_new, i, dattr_cur, j)) |
| 9004 | goto match2; | 9115 | goto match2; |
| 9005 | } | 9116 | } |
| 9006 | /* no match - add a new doms_new */ | 9117 | /* no match - add a new doms_new */ |
| 9007 | __build_sched_domains(doms_new + i, | 9118 | __build_sched_domains(doms_new[i], |
| 9008 | dattr_new ? dattr_new + i : NULL); | 9119 | dattr_new ? dattr_new + i : NULL); |
| 9009 | match2: | 9120 | match2: |
| 9010 | ; | 9121 | ; |
| 9011 | } | 9122 | } |
| 9012 | 9123 | ||
| 9013 | /* Remember the new sched domains */ | 9124 | /* Remember the new sched domains */ |
| 9014 | if (doms_cur != fallback_doms) | 9125 | if (doms_cur != &fallback_doms) |
| 9015 | kfree(doms_cur); | 9126 | free_sched_domains(doms_cur, ndoms_cur); |
| 9016 | kfree(dattr_cur); /* kfree(NULL) is safe */ | 9127 | kfree(dattr_cur); /* kfree(NULL) is safe */ |
| 9017 | doms_cur = doms_new; | 9128 | doms_cur = doms_new; |
| 9018 | dattr_cur = dattr_new; | 9129 | dattr_cur = dattr_new; |
| @@ -9123,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
| 9123 | switch (action) { | 9234 | switch (action) { |
| 9124 | case CPU_ONLINE: | 9235 | case CPU_ONLINE: |
| 9125 | case CPU_ONLINE_FROZEN: | 9236 | case CPU_ONLINE_FROZEN: |
| 9126 | case CPU_DEAD: | 9237 | case CPU_DOWN_PREPARE: |
| 9127 | case CPU_DEAD_FROZEN: | 9238 | case CPU_DOWN_PREPARE_FROZEN: |
| 9239 | case CPU_DOWN_FAILED: | ||
| 9240 | case CPU_DOWN_FAILED_FROZEN: | ||
| 9128 | partition_sched_domains(1, NULL, NULL); | 9241 | partition_sched_domains(1, NULL, NULL); |
| 9129 | return NOTIFY_OK; | 9242 | return NOTIFY_OK; |
| 9130 | 9243 | ||
| @@ -9171,7 +9284,7 @@ void __init sched_init_smp(void) | |||
| 9171 | #endif | 9284 | #endif |
| 9172 | get_online_cpus(); | 9285 | get_online_cpus(); |
| 9173 | mutex_lock(&sched_domains_mutex); | 9286 | mutex_lock(&sched_domains_mutex); |
| 9174 | arch_init_sched_domains(cpu_online_mask); | 9287 | arch_init_sched_domains(cpu_active_mask); |
| 9175 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); | 9288 | cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); |
| 9176 | if (cpumask_empty(non_isolated_cpus)) | 9289 | if (cpumask_empty(non_isolated_cpus)) |
| 9177 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); | 9290 | cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); |
| @@ -9334,10 +9447,6 @@ void __init sched_init(void) | |||
| 9334 | #ifdef CONFIG_CPUMASK_OFFSTACK | 9447 | #ifdef CONFIG_CPUMASK_OFFSTACK |
| 9335 | alloc_size += num_possible_cpus() * cpumask_size(); | 9448 | alloc_size += num_possible_cpus() * cpumask_size(); |
| 9336 | #endif | 9449 | #endif |
| 9337 | /* | ||
| 9338 | * As sched_init() is called before page_alloc is setup, | ||
| 9339 | * we use alloc_bootmem(). | ||
| 9340 | */ | ||
| 9341 | if (alloc_size) { | 9450 | if (alloc_size) { |
| 9342 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 9451 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
| 9343 | 9452 | ||
| @@ -9406,6 +9515,10 @@ void __init sched_init(void) | |||
| 9406 | #endif /* CONFIG_USER_SCHED */ | 9515 | #endif /* CONFIG_USER_SCHED */ |
| 9407 | #endif /* CONFIG_GROUP_SCHED */ | 9516 | #endif /* CONFIG_GROUP_SCHED */ |
| 9408 | 9517 | ||
| 9518 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
| 9519 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
| 9520 | __alignof__(unsigned long)); | ||
| 9521 | #endif | ||
| 9409 | for_each_possible_cpu(i) { | 9522 | for_each_possible_cpu(i) { |
| 9410 | struct rq *rq; | 9523 | struct rq *rq; |
| 9411 | 9524 | ||
| @@ -9488,6 +9601,8 @@ void __init sched_init(void) | |||
| 9488 | rq->cpu = i; | 9601 | rq->cpu = i; |
| 9489 | rq->online = 0; | 9602 | rq->online = 0; |
| 9490 | rq->migration_thread = NULL; | 9603 | rq->migration_thread = NULL; |
| 9604 | rq->idle_stamp = 0; | ||
| 9605 | rq->avg_idle = 2*sysctl_sched_migration_cost; | ||
| 9491 | INIT_LIST_HEAD(&rq->migration_queue); | 9606 | INIT_LIST_HEAD(&rq->migration_queue); |
| 9492 | rq_attach_root(rq, &def_root_domain); | 9607 | rq_attach_root(rq, &def_root_domain); |
| 9493 | #endif | 9608 | #endif |
| @@ -9531,13 +9646,15 @@ void __init sched_init(void) | |||
| 9531 | current->sched_class = &fair_sched_class; | 9646 | current->sched_class = &fair_sched_class; |
| 9532 | 9647 | ||
| 9533 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 9648 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
| 9534 | alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 9649 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
| 9535 | #ifdef CONFIG_SMP | 9650 | #ifdef CONFIG_SMP |
| 9536 | #ifdef CONFIG_NO_HZ | 9651 | #ifdef CONFIG_NO_HZ |
| 9537 | alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 9652 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); |
| 9538 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 9653 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); |
| 9539 | #endif | 9654 | #endif |
| 9540 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 9655 | /* May be allocated at isolcpus cmdline parse time */ |
| 9656 | if (cpu_isolated_map == NULL) | ||
| 9657 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | ||
| 9541 | #endif /* SMP */ | 9658 | #endif /* SMP */ |
| 9542 | 9659 | ||
| 9543 | perf_event_init(); | 9660 | perf_event_init(); |
| @@ -9731,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 9731 | se = kzalloc_node(sizeof(struct sched_entity), | 9848 | se = kzalloc_node(sizeof(struct sched_entity), |
| 9732 | GFP_KERNEL, cpu_to_node(i)); | 9849 | GFP_KERNEL, cpu_to_node(i)); |
| 9733 | if (!se) | 9850 | if (!se) |
| 9734 | goto err; | 9851 | goto err_free_rq; |
| 9735 | 9852 | ||
| 9736 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 9853 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); |
| 9737 | } | 9854 | } |
| 9738 | 9855 | ||
| 9739 | return 1; | 9856 | return 1; |
| 9740 | 9857 | ||
| 9858 | err_free_rq: | ||
| 9859 | kfree(cfs_rq); | ||
| 9741 | err: | 9860 | err: |
| 9742 | return 0; | 9861 | return 0; |
| 9743 | } | 9862 | } |
| @@ -9819,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
| 9819 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), | 9938 | rt_se = kzalloc_node(sizeof(struct sched_rt_entity), |
| 9820 | GFP_KERNEL, cpu_to_node(i)); | 9939 | GFP_KERNEL, cpu_to_node(i)); |
| 9821 | if (!rt_se) | 9940 | if (!rt_se) |
| 9822 | goto err; | 9941 | goto err_free_rq; |
| 9823 | 9942 | ||
| 9824 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 9943 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); |
| 9825 | } | 9944 | } |
| 9826 | 9945 | ||
| 9827 | return 1; | 9946 | return 1; |
| 9828 | 9947 | ||
| 9948 | err_free_rq: | ||
| 9949 | kfree(rt_rq); | ||
| 9829 | err: | 9950 | err: |
| 9830 | return 0; | 9951 | return 0; |
| 9831 | } | 9952 | } |
| @@ -10867,6 +10988,7 @@ void synchronize_sched_expedited(void) | |||
| 10867 | spin_unlock_irqrestore(&rq->lock, flags); | 10988 | spin_unlock_irqrestore(&rq->lock, flags); |
| 10868 | } | 10989 | } |
| 10869 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | 10990 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; |
| 10991 | synchronize_sched_expedited_count++; | ||
| 10870 | mutex_unlock(&rcu_sched_expedited_mutex); | 10992 | mutex_unlock(&rcu_sched_expedited_mutex); |
| 10871 | put_online_cpus(); | 10993 | put_online_cpus(); |
| 10872 | if (need_full_sync) | 10994 | if (need_full_sync) |
