about summary refs log tree commit diff stats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- kernel/sched.c 466
1 files changed, 277 insertions, 189 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0a948d..ff39cadf621e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -535,14 +535,12 @@ struct rq {
535 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
536 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
537#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
538 unsigned long last_tick_seen;
539 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
540#endif 539#endif
541 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
542 struct load_weight load; 541 struct load_weight load;
543 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
544 u64 nr_switches; 543 u64 nr_switches;
545 u64 nr_migrations_in;
546 544
547 struct cfs_rq cfs; 545 struct cfs_rq cfs;
548 struct rt_rq rt; 546 struct rt_rq rt;
@@ -591,6 +589,8 @@ struct rq {
591 589
592 u64 rt_avg; 590 u64 rt_avg;
593 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
594#endif 594#endif
595 595
596 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -772,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
772 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
773 return -EINVAL; 773 return -EINVAL;
774 774
775 filp->f_pos += cnt; 775 *ppos += cnt;
776 776
777 return cnt; 777 return cnt;
778} 778}
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
814 * default: 0.25ms 814 * default: 0.25ms
815 */ 815 */
816unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
817 818
818/* 819/*
819 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1614 */ 1615 */
1615static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1616{ 1617{
1617 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight; 1619 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1620 unsigned long flags; 1621 unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1630 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1632 1633
1634 rq_weight += weight;
1633 /* 1635 /*
1634 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1635 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1638 if (!weight) 1640 if (!weight)
1639 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1640 1642
1641 rq_weight += weight; 1643 sum_weight += weight;
1642 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1643 } 1645 }
1644 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1645 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares; 1651 shares = tg->shares;
1647 1652
@@ -1810,6 +1815,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1810#endif 1815#endif
1811 1816
1812static void calc_load_account_active(struct rq *this_rq); 1817static void calc_load_account_active(struct rq *this_rq);
1818static void update_sysctl(void);
1819static int get_update_sysctl_factor(void);
1820
1821static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1822{
1823 set_task_rq(p, cpu);
1824#ifdef CONFIG_SMP
1825 /*
1826 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1827 * successfuly executed on another CPU. We must ensure that updates of
1828 * per-task data have been completed by this moment.
1829 */
1830 smp_wmb();
1831 task_thread_info(p)->cpu = cpu;
1832#endif
1833}
1813 1834
1814#include "sched_stats.h" 1835#include "sched_stats.h"
1815#include "sched_idletask.c" 1836#include "sched_idletask.c"
@@ -1967,20 +1988,6 @@ inline int task_curr(const struct task_struct *p)
1967 return cpu_curr(task_cpu(p)) == p; 1988 return cpu_curr(task_cpu(p)) == p;
1968} 1989}
1969 1990
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974 /*
1975 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1976 * successfuly executed on another CPU. We must ensure that updates of
1977 * per-task data have been completed by this moment.
1978 */
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1991static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class, 1992 const struct sched_class *prev_class,
1986 int oldprio, int running) 1993 int oldprio, int running)
@@ -2017,6 +2024,7 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
2017 } 2024 }
2018 2025
2019 spin_lock_irqsave(&rq->lock, flags); 2026 spin_lock_irqsave(&rq->lock, flags);
2027 update_rq_clock(rq);
2020 set_task_cpu(p, cpu); 2028 set_task_cpu(p, cpu);
2021 p->cpus_allowed = cpumask_of_cpu(cpu); 2029 p->cpus_allowed = cpumask_of_cpu(cpu);
2022 p->rt.nr_cpus_allowed = 1; 2030 p->rt.nr_cpus_allowed = 1;
@@ -2059,30 +2067,13 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2059void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2067void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2060{ 2068{
2061 int old_cpu = task_cpu(p); 2069 int old_cpu = task_cpu(p);
2062 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2063 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2070 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2064 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2071 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2065 u64 clock_offset;
2066
2067 clock_offset = old_rq->clock - new_rq->clock;
2068 2072
2069 trace_sched_migrate_task(p, new_cpu); 2073 trace_sched_migrate_task(p, new_cpu);
2070 2074
2071#ifdef CONFIG_SCHEDSTATS
2072 if (p->se.wait_start)
2073 p->se.wait_start -= clock_offset;
2074 if (p->se.sleep_start)
2075 p->se.sleep_start -= clock_offset;
2076 if (p->se.block_start)
2077 p->se.block_start -= clock_offset;
2078#endif
2079 if (old_cpu != new_cpu) { 2075 if (old_cpu != new_cpu) {
2080 p->se.nr_migrations++; 2076 p->se.nr_migrations++;
2081 new_rq->nr_migrations_in++;
2082#ifdef CONFIG_SCHEDSTATS
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2077 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0); 2078 1, 1, NULL, 0);
2088 } 2079 }
@@ -2115,6 +2106,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2115 * it is sufficient to simply update the task's cpu field. 2106 * it is sufficient to simply update the task's cpu field.
2116 */ 2107 */
2117 if (!p->se.on_rq && !task_running(rq, p)) { 2108 if (!p->se.on_rq && !task_running(rq, p)) {
2109 update_rq_clock(rq);
2118 set_task_cpu(p, dest_cpu); 2110 set_task_cpu(p, dest_cpu);
2119 return 0; 2111 return 0;
2120 } 2112 }
@@ -2322,6 +2314,14 @@ void task_oncpu_function_call(struct task_struct *p,
2322 preempt_enable(); 2314 preempt_enable();
2323} 2315}
2324 2316
2317#ifdef CONFIG_SMP
2318static inline
2319int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2320{
2321 return p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2322}
2323#endif
2324
2325/*** 2325/***
2326 * try_to_wake_up - wake up a thread 2326 * try_to_wake_up - wake up a thread
2327 * @p: the to-be-woken-up thread 2327 * @p: the to-be-woken-up thread
@@ -2373,16 +2373,14 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2373 if (task_contributes_to_load(p)) 2373 if (task_contributes_to_load(p))
2374 rq->nr_uninterruptible--; 2374 rq->nr_uninterruptible--;
2375 p->state = TASK_WAKING; 2375 p->state = TASK_WAKING;
2376 task_rq_unlock(rq, &flags); 2376 __task_rq_unlock(rq);
2377 2377
2378 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2378 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2379 if (cpu != orig_cpu) 2379 if (cpu != orig_cpu)
2380 set_task_cpu(p, cpu); 2380 set_task_cpu(p, cpu);
2381 2381
2382 rq = task_rq_lock(p, &flags); 2382 rq = __task_rq_lock(p);
2383 2383 update_rq_clock(rq);
2384 if (rq != orig_rq)
2385 update_rq_clock(rq);
2386 2384
2387 WARN_ON(p->state != TASK_WAKING); 2385 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p); 2386 cpu = task_cpu(p);
@@ -2440,6 +2438,17 @@ out_running:
2440#ifdef CONFIG_SMP 2438#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up) 2439 if (p->sched_class->task_wake_up)
2442 p->sched_class->task_wake_up(rq, p); 2440 p->sched_class->task_wake_up(rq, p);
2441
2442 if (unlikely(rq->idle_stamp)) {
2443 u64 delta = rq->clock - rq->idle_stamp;
2444 u64 max = 2*sysctl_sched_migration_cost;
2445
2446 if (delta > max)
2447 rq->avg_idle = max;
2448 else
2449 update_avg(&rq->avg_idle, delta);
2450 rq->idle_stamp = 0;
2451 }
2443#endif 2452#endif
2444out: 2453out:
2445 task_rq_unlock(rq, &flags); 2454 task_rq_unlock(rq, &flags);
@@ -2486,7 +2495,6 @@ static void __sched_fork(struct task_struct *p)
2486 p->se.avg_overlap = 0; 2495 p->se.avg_overlap = 0;
2487 p->se.start_runtime = 0; 2496 p->se.start_runtime = 0;
2488 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2497 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2489 p->se.avg_running = 0;
2490 2498
2491#ifdef CONFIG_SCHEDSTATS 2499#ifdef CONFIG_SCHEDSTATS
2492 p->se.wait_start = 0; 2500 p->se.wait_start = 0;
@@ -2508,7 +2516,6 @@ static void __sched_fork(struct task_struct *p)
2508 p->se.nr_failed_migrations_running = 0; 2516 p->se.nr_failed_migrations_running = 0;
2509 p->se.nr_failed_migrations_hot = 0; 2517 p->se.nr_failed_migrations_hot = 0;
2510 p->se.nr_forced_migrations = 0; 2518 p->se.nr_forced_migrations = 0;
2511 p->se.nr_forced2_migrations = 0;
2512 2519
2513 p->se.nr_wakeups = 0; 2520 p->se.nr_wakeups = 0;
2514 p->se.nr_wakeups_sync = 0; 2521 p->se.nr_wakeups_sync = 0;
@@ -2578,8 +2585,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2578 if (!rt_prio(p->prio)) 2585 if (!rt_prio(p->prio))
2579 p->sched_class = &fair_sched_class; 2586 p->sched_class = &fair_sched_class;
2580 2587
2588 if (p->sched_class->task_fork)
2589 p->sched_class->task_fork(p);
2590
2581#ifdef CONFIG_SMP 2591#ifdef CONFIG_SMP
2582 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2592 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2583#endif 2593#endif
2584 set_task_cpu(p, cpu); 2594 set_task_cpu(p, cpu);
2585 2595
@@ -2614,17 +2624,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2614 rq = task_rq_lock(p, &flags); 2624 rq = task_rq_lock(p, &flags);
2615 BUG_ON(p->state != TASK_RUNNING); 2625 BUG_ON(p->state != TASK_RUNNING);
2616 update_rq_clock(rq); 2626 update_rq_clock(rq);
2617 2627 activate_task(rq, p, 0);
2618 if (!p->sched_class->task_new || !current->se.on_rq) {
2619 activate_task(rq, p, 0);
2620 } else {
2621 /*
2622 * Let the scheduling class do new task startup
2623 * management (if any):
2624 */
2625 p->sched_class->task_new(rq, p);
2626 inc_nr_running(rq);
2627 }
2628 trace_sched_wakeup_new(rq, p, 1); 2628 trace_sched_wakeup_new(rq, p, 1);
2629 check_preempt_curr(rq, p, WF_FORK); 2629 check_preempt_curr(rq, p, WF_FORK);
2630#ifdef CONFIG_SMP 2630#ifdef CONFIG_SMP
@@ -2848,14 +2848,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2848 */ 2848 */
2849 arch_start_context_switch(prev); 2849 arch_start_context_switch(prev);
2850 2850
2851 if (unlikely(!mm)) { 2851 if (likely(!mm)) {
2852 next->active_mm = oldmm; 2852 next->active_mm = oldmm;
2853 atomic_inc(&oldmm->mm_count); 2853 atomic_inc(&oldmm->mm_count);
2854 enter_lazy_tlb(oldmm, next); 2854 enter_lazy_tlb(oldmm, next);
2855 } else 2855 } else
2856 switch_mm(oldmm, mm, next); 2856 switch_mm(oldmm, mm, next);
2857 2857
2858 if (unlikely(!prev->mm)) { 2858 if (likely(!prev->mm)) {
2859 prev->active_mm = NULL; 2859 prev->active_mm = NULL;
2860 rq->prev_mm = oldmm; 2860 rq->prev_mm = oldmm;
2861 } 2861 }
@@ -3018,15 +3018,6 @@ static void calc_load_account_active(struct rq *this_rq)
3018} 3018}
3019 3019
3020/* 3020/*
3021 * Externally visible per-cpu scheduler statistics:
3022 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3023 */
3024u64 cpu_nr_migrations(int cpu)
3025{
3026 return cpu_rq(cpu)->nr_migrations_in;
3027}
3028
3029/*
3030 * Update rq->cpu_load[] statistics. This function is usually called every 3021 * Update rq->cpu_load[] statistics. This function is usually called every
3031 * scheduler tick (TICK_NSEC). 3022 * scheduler tick (TICK_NSEC).
3032 */ 3023 */
@@ -3148,7 +3139,7 @@ out:
3148void sched_exec(void) 3139void sched_exec(void)
3149{ 3140{
3150 int new_cpu, this_cpu = get_cpu(); 3141 int new_cpu, this_cpu = get_cpu();
3151 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3142 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3152 put_cpu(); 3143 put_cpu();
3153 if (new_cpu != this_cpu) 3144 if (new_cpu != this_cpu)
3154 sched_migrate_task(current, new_cpu); 3145 sched_migrate_task(current, new_cpu);
@@ -3164,10 +3155,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3164 deactivate_task(src_rq, p, 0); 3155 deactivate_task(src_rq, p, 0);
3165 set_task_cpu(p, this_cpu); 3156 set_task_cpu(p, this_cpu);
3166 activate_task(this_rq, p, 0); 3157 activate_task(this_rq, p, 0);
3167 /*
3168 * Note that idle threads have a prio of MAX_PRIO, for this test
3169 * to be always true for them.
3170 */
3171 check_preempt_curr(this_rq, p, 0); 3158 check_preempt_curr(this_rq, p, 0);
3172} 3159}
3173 3160
@@ -4126,7 +4113,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4126 unsigned long flags; 4113 unsigned long flags;
4127 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4114 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4128 4115
4129 cpumask_setall(cpus); 4116 cpumask_copy(cpus, cpu_active_mask);
4130 4117
4131 /* 4118 /*
4132 * When power savings policy is enabled for the parent domain, idle 4119 * When power savings policy is enabled for the parent domain, idle
@@ -4289,7 +4276,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4289 int all_pinned = 0; 4276 int all_pinned = 0;
4290 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4277 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4291 4278
4292 cpumask_setall(cpus); 4279 cpumask_copy(cpus, cpu_active_mask);
4293 4280
4294 /* 4281 /*
4295 * When power savings policy is enabled for the parent domain, idle 4282 * When power savings policy is enabled for the parent domain, idle
@@ -4429,6 +4416,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4429 int pulled_task = 0; 4416 int pulled_task = 0;
4430 unsigned long next_balance = jiffies + HZ; 4417 unsigned long next_balance = jiffies + HZ;
4431 4418
4419 this_rq->idle_stamp = this_rq->clock;
4420
4421 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4422 return;
4423
4432 for_each_domain(this_cpu, sd) { 4424 for_each_domain(this_cpu, sd) {
4433 unsigned long interval; 4425 unsigned long interval;
4434 4426
@@ -4443,8 +4435,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4443 interval = msecs_to_jiffies(sd->balance_interval); 4435 interval = msecs_to_jiffies(sd->balance_interval);
4444 if (time_after(next_balance, sd->last_balance + interval)) 4436 if (time_after(next_balance, sd->last_balance + interval))
4445 next_balance = sd->last_balance + interval; 4437 next_balance = sd->last_balance + interval;
4446 if (pulled_task) 4438 if (pulled_task) {
4439 this_rq->idle_stamp = 0;
4447 break; 4440 break;
4441 }
4448 } 4442 }
4449 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4443 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4450 /* 4444 /*
@@ -4679,7 +4673,7 @@ int select_nohz_load_balancer(int stop_tick)
4679 cpumask_set_cpu(cpu, nohz.cpu_mask); 4673 cpumask_set_cpu(cpu, nohz.cpu_mask);
4680 4674
4681 /* time for ilb owner also to sleep */ 4675 /* time for ilb owner also to sleep */
4682 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4676 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4683 if (atomic_read(&nohz.load_balancer) == cpu) 4677 if (atomic_read(&nohz.load_balancer) == cpu)
4684 atomic_set(&nohz.load_balancer, -1); 4678 atomic_set(&nohz.load_balancer, -1);
4685 return 0; 4679 return 0;
@@ -5046,8 +5040,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5046 p->gtime = cputime_add(p->gtime, cputime); 5040 p->gtime = cputime_add(p->gtime, cputime);
5047 5041
5048 /* Add guest time to cpustat. */ 5042 /* Add guest time to cpustat. */
5049 cpustat->user = cputime64_add(cpustat->user, tmp); 5043 if (TASK_NICE(p) > 0) {
5050 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5044 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5045 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5046 } else {
5047 cpustat->user = cputime64_add(cpustat->user, tmp);
5048 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5049 }
5051} 5050}
5052 5051
5053/* 5052/*
@@ -5162,60 +5161,86 @@ void account_idle_ticks(unsigned long ticks)
5162 * Use precise platform statistics if available: 5161 * Use precise platform statistics if available:
5163 */ 5162 */
5164#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5163#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5165cputime_t task_utime(struct task_struct *p) 5164void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5166{ 5165{
5167 return p->utime; 5166 *ut = p->utime;
5167 *st = p->stime;
5168} 5168}
5169 5169
5170cputime_t task_stime(struct task_struct *p) 5170void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5171{ 5171{
5172 return p->stime; 5172 struct task_cputime cputime;
5173
5174 thread_group_cputime(p, &cputime);
5175
5176 *ut = cputime.utime;
5177 *st = cputime.stime;
5173} 5178}
5174#else 5179#else
5175cputime_t task_utime(struct task_struct *p) 5180
5181#ifndef nsecs_to_cputime
5182# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5183#endif
5184
5185void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5176{ 5186{
5177 clock_t utime = cputime_to_clock_t(p->utime), 5187 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5178 total = utime + cputime_to_clock_t(p->stime);
5179 u64 temp;
5180 5188
5181 /* 5189 /*
5182 * Use CFS's precise accounting: 5190 * Use CFS's precise accounting:
5183 */ 5191 */
5184 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5192 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5185 5193
5186 if (total) { 5194 if (total) {
5187 temp *= utime; 5195 u64 temp;
5196
5197 temp = (u64)(rtime * utime);
5188 do_div(temp, total); 5198 do_div(temp, total);
5189 } 5199 utime = (cputime_t)temp;
5190 utime = (clock_t)temp; 5200 } else
5201 utime = rtime;
5191 5202
5192 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5203 /*
5193 return p->prev_utime; 5204 * Compare with previous values, to keep monotonicity:
5205 */
5206 p->prev_utime = max(p->prev_utime, utime);
5207 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5208
5209 *ut = p->prev_utime;
5210 *st = p->prev_stime;
5194} 5211}
5195 5212
5196cputime_t task_stime(struct task_struct *p) 5213/*
5214 * Must be called with siglock held.
5215 */
5216void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5197{ 5217{
5198 clock_t stime; 5218 struct signal_struct *sig = p->signal;
5219 struct task_cputime cputime;
5220 cputime_t rtime, utime, total;
5199 5221
5200 /* 5222 thread_group_cputime(p, &cputime);
5201 * Use CFS's precise accounting. (we subtract utime from
5202 * the total, to make sure the total observed by userspace
5203 * grows monotonically - apps rely on that):
5204 */
5205 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5206 cputime_to_clock_t(task_utime(p));
5207 5223
5208 if (stime >= 0) 5224 total = cputime_add(cputime.utime, cputime.stime);
5209 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5225 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5210 5226
5211 return p->prev_stime; 5227 if (total) {
5212} 5228 u64 temp;
5213#endif
5214 5229
5215inline cputime_t task_gtime(struct task_struct *p) 5230 temp = (u64)(rtime * cputime.utime);
5216{ 5231 do_div(temp, total);
5217 return p->gtime; 5232 utime = (cputime_t)temp;
5233 } else
5234 utime = rtime;
5235
5236 sig->prev_utime = max(sig->prev_utime, utime);
5237 sig->prev_stime = max(sig->prev_stime,
5238 cputime_sub(rtime, sig->prev_utime));
5239
5240 *ut = sig->prev_utime;
5241 *st = sig->prev_stime;
5218} 5242}
5243#endif
5219 5244
5220/* 5245/*
5221 * This function gets called by the timer code, with HZ frequency. 5246 * This function gets called by the timer code, with HZ frequency.
@@ -5350,13 +5375,14 @@ static inline void schedule_debug(struct task_struct *prev)
5350#endif 5375#endif
5351} 5376}
5352 5377
5353static void put_prev_task(struct rq *rq, struct task_struct *p) 5378static void put_prev_task(struct rq *rq, struct task_struct *prev)
5354{ 5379{
5355 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5380 if (prev->state == TASK_RUNNING) {
5381 u64 runtime = prev->se.sum_exec_runtime;
5356 5382
5357 update_avg(&p->se.avg_running, runtime); 5383 runtime -= prev->se.prev_sum_exec_runtime;
5384 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5358 5385
5359 if (p->state == TASK_RUNNING) {
5360 /* 5386 /*
5361 * In order to avoid avg_overlap growing stale when we are 5387 * In order to avoid avg_overlap growing stale when we are
5362 * indeed overlapping and hence not getting put to sleep, grow 5388 * indeed overlapping and hence not getting put to sleep, grow
@@ -5366,12 +5392,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5366 * correlates to the amount of cache footprint a task can 5392 * correlates to the amount of cache footprint a task can
5367 * build up. 5393 * build up.
5368 */ 5394 */
5369 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5395 update_avg(&prev->se.avg_overlap, runtime);
5370 update_avg(&p->se.avg_overlap, runtime);
5371 } else {
5372 update_avg(&p->se.avg_running, 0);
5373 } 5396 }
5374 p->sched_class->put_prev_task(rq, p); 5397 prev->sched_class->put_prev_task(rq, prev);
5375} 5398}
5376 5399
5377/* 5400/*
@@ -5481,7 +5504,7 @@ need_resched_nonpreemptible:
5481} 5504}
5482EXPORT_SYMBOL(schedule); 5505EXPORT_SYMBOL(schedule);
5483 5506
5484#ifdef CONFIG_SMP 5507#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5485/* 5508/*
5486 * Look out! "owner" is an entirely speculative pointer 5509 * Look out! "owner" is an entirely speculative pointer
5487 * access and not reliable. 5510 * access and not reliable.
@@ -6175,22 +6198,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6175 BUG_ON(p->se.on_rq); 6198 BUG_ON(p->se.on_rq);
6176 6199
6177 p->policy = policy; 6200 p->policy = policy;
6178 switch (p->policy) {
6179 case SCHED_NORMAL:
6180 case SCHED_BATCH:
6181 case SCHED_IDLE:
6182 p->sched_class = &fair_sched_class;
6183 break;
6184 case SCHED_FIFO:
6185 case SCHED_RR:
6186 p->sched_class = &rt_sched_class;
6187 break;
6188 }
6189
6190 p->rt_priority = prio; 6201 p->rt_priority = prio;
6191 p->normal_prio = normal_prio(p); 6202 p->normal_prio = normal_prio(p);
6192 /* we are holding p->pi_lock already */ 6203 /* we are holding p->pi_lock already */
6193 p->prio = rt_mutex_getprio(p); 6204 p->prio = rt_mutex_getprio(p);
6205 if (rt_prio(p->prio))
6206 p->sched_class = &rt_sched_class;
6207 else
6208 p->sched_class = &fair_sched_class;
6194 set_load_weight(p); 6209 set_load_weight(p);
6195} 6210}
6196 6211
@@ -6593,6 +6608,8 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6593long sched_getaffinity(pid_t pid, struct cpumask *mask) 6608long sched_getaffinity(pid_t pid, struct cpumask *mask)
6594{ 6609{
6595 struct task_struct *p; 6610 struct task_struct *p;
6611 unsigned long flags;
6612 struct rq *rq;
6596 int retval; 6613 int retval;
6597 6614
6598 get_online_cpus(); 6615 get_online_cpus();
@@ -6607,7 +6624,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6607 if (retval) 6624 if (retval)
6608 goto out_unlock; 6625 goto out_unlock;
6609 6626
6627 rq = task_rq_lock(p, &flags);
6610 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6628 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6629 task_rq_unlock(rq, &flags);
6611 6630
6612out_unlock: 6631out_unlock:
6613 read_unlock(&tasklist_lock); 6632 read_unlock(&tasklist_lock);
@@ -6845,6 +6864,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6845{ 6864{
6846 struct task_struct *p; 6865 struct task_struct *p;
6847 unsigned int time_slice; 6866 unsigned int time_slice;
6867 unsigned long flags;
6868 struct rq *rq;
6848 int retval; 6869 int retval;
6849 struct timespec t; 6870 struct timespec t;
6850 6871
@@ -6861,7 +6882,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6861 if (retval) 6882 if (retval)
6862 goto out_unlock; 6883 goto out_unlock;
6863 6884
6864 time_slice = p->sched_class->get_rr_interval(p); 6885 rq = task_rq_lock(p, &flags);
6886 time_slice = p->sched_class->get_rr_interval(rq, p);
6887 task_rq_unlock(rq, &flags);
6865 6888
6866 read_unlock(&tasklist_lock); 6889 read_unlock(&tasklist_lock);
6867 jiffies_to_timespec(time_slice, &t); 6890 jiffies_to_timespec(time_slice, &t);
@@ -6935,7 +6958,7 @@ void show_state_filter(unsigned long state_filter)
6935 /* 6958 /*
6936 * Only show locks if all tasks are dumped: 6959 * Only show locks if all tasks are dumped:
6937 */ 6960 */
6938 if (state_filter == -1) 6961 if (!state_filter)
6939 debug_show_all_locks(); 6962 debug_show_all_locks();
6940} 6963}
6941 6964
@@ -6962,7 +6985,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6962 __sched_fork(idle); 6985 __sched_fork(idle);
6963 idle->se.exec_start = sched_clock(); 6986 idle->se.exec_start = sched_clock();
6964 6987
6965 idle->prio = idle->normal_prio = MAX_PRIO;
6966 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6988 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6967 __set_task_cpu(idle, cpu); 6989 __set_task_cpu(idle, cpu);
6968 6990
@@ -7003,22 +7025,43 @@ cpumask_var_t nohz_cpu_mask;
7003 * 7025 *
7004 * This idea comes from the SD scheduler of Con Kolivas: 7026 * This idea comes from the SD scheduler of Con Kolivas:
7005 */ 7027 */
7006static inline void sched_init_granularity(void) 7028static int get_update_sysctl_factor(void)
7007{ 7029{
7008 unsigned int factor = 1 + ilog2(num_online_cpus()); 7030 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7009 const unsigned long limit = 200000000; 7031 unsigned int factor;
7010 7032
7011 sysctl_sched_min_granularity *= factor; 7033 switch (sysctl_sched_tunable_scaling) {
7012 if (sysctl_sched_min_granularity > limit) 7034 case SCHED_TUNABLESCALING_NONE:
7013 sysctl_sched_min_granularity = limit; 7035 factor = 1;
7036 break;
7037 case SCHED_TUNABLESCALING_LINEAR:
7038 factor = cpus;
7039 break;
7040 case SCHED_TUNABLESCALING_LOG:
7041 default:
7042 factor = 1 + ilog2(cpus);
7043 break;
7044 }
7014 7045
7015 sysctl_sched_latency *= factor; 7046 return factor;
7016 if (sysctl_sched_latency > limit) 7047}
7017 sysctl_sched_latency = limit;
7018 7048
7019 sysctl_sched_wakeup_granularity *= factor; 7049static void update_sysctl(void)
7050{
7051 unsigned int factor = get_update_sysctl_factor();
7052
7053#define SET_SYSCTL(name) \
7054 (sysctl_##name = (factor) * normalized_sysctl_##name)
7055 SET_SYSCTL(sched_min_granularity);
7056 SET_SYSCTL(sched_latency);
7057 SET_SYSCTL(sched_wakeup_granularity);
7058 SET_SYSCTL(sched_shares_ratelimit);
7059#undef SET_SYSCTL
7060}
7020 7061
7021 sysctl_sched_shares_ratelimit *= factor; 7062static inline void sched_init_granularity(void)
7063{
7064 update_sysctl();
7022} 7065}
7023 7066
7024#ifdef CONFIG_SMP 7067#ifdef CONFIG_SMP
@@ -7055,7 +7098,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7055 int ret = 0; 7098 int ret = 0;
7056 7099
7057 rq = task_rq_lock(p, &flags); 7100 rq = task_rq_lock(p, &flags);
7058 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7101 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7059 ret = -EINVAL; 7102 ret = -EINVAL;
7060 goto out; 7103 goto out;
7061 } 7104 }
@@ -7077,7 +7120,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7077 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7120 if (cpumask_test_cpu(task_cpu(p), new_mask))
7078 goto out; 7121 goto out;
7079 7122
7080 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7123 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7081 /* Need help from migration thread: drop lock and wait. */ 7124 /* Need help from migration thread: drop lock and wait. */
7082 struct task_struct *mt = rq->migration_thread; 7125 struct task_struct *mt = rq->migration_thread;
7083 7126
@@ -7231,19 +7274,19 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7231 7274
7232again: 7275again:
7233 /* Look for allowed, online CPU in same node. */ 7276 /* Look for allowed, online CPU in same node. */
7234 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7277 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7235 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7278 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7236 goto move; 7279 goto move;
7237 7280
7238 /* Any allowed, online CPU? */ 7281 /* Any allowed, online CPU? */
7239 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7282 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7240 if (dest_cpu < nr_cpu_ids) 7283 if (dest_cpu < nr_cpu_ids)
7241 goto move; 7284 goto move;
7242 7285
7243 /* No more Mr. Nice Guy. */ 7286 /* No more Mr. Nice Guy. */
7244 if (dest_cpu >= nr_cpu_ids) { 7287 if (dest_cpu >= nr_cpu_ids) {
7245 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7288 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7246 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7289 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7247 7290
7248 /* 7291 /*
7249 * Don't tell them about moving exiting tasks or 7292 * Don't tell them about moving exiting tasks or
@@ -7272,7 +7315,7 @@ move:
7272 */ 7315 */
7273static void migrate_nr_uninterruptible(struct rq *rq_src) 7316static void migrate_nr_uninterruptible(struct rq *rq_src)
7274{ 7317{
7275 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7318 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7276 unsigned long flags; 7319 unsigned long flags;
7277 7320
7278 local_irq_save(flags); 7321 local_irq_save(flags);
@@ -7406,17 +7449,16 @@ static struct ctl_table sd_ctl_dir[] = {
7406 .procname = "sched_domain", 7449 .procname = "sched_domain",
7407 .mode = 0555, 7450 .mode = 0555,
7408 }, 7451 },
7409 {0, }, 7452 {}
7410}; 7453};
7411 7454
7412static struct ctl_table sd_ctl_root[] = { 7455static struct ctl_table sd_ctl_root[] = {
7413 { 7456 {
7414 .ctl_name = CTL_KERN,
7415 .procname = "kernel", 7457 .procname = "kernel",
7416 .mode = 0555, 7458 .mode = 0555,
7417 .child = sd_ctl_dir, 7459 .child = sd_ctl_dir,
7418 }, 7460 },
7419 {0, }, 7461 {}
7420}; 7462};
7421 7463
7422static struct ctl_table *sd_alloc_ctl_entry(int n) 7464static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7526,7 +7568,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7526static struct ctl_table_header *sd_sysctl_header; 7568static struct ctl_table_header *sd_sysctl_header;
7527static void register_sched_domain_sysctl(void) 7569static void register_sched_domain_sysctl(void)
7528{ 7570{
7529 int i, cpu_num = num_online_cpus(); 7571 int i, cpu_num = num_possible_cpus();
7530 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7572 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7531 char buf[32]; 7573 char buf[32];
7532 7574
@@ -7536,7 +7578,7 @@ static void register_sched_domain_sysctl(void)
7536 if (entry == NULL) 7578 if (entry == NULL)
7537 return; 7579 return;
7538 7580
7539 for_each_online_cpu(i) { 7581 for_each_possible_cpu(i) {
7540 snprintf(buf, 32, "cpu%d", i); 7582 snprintf(buf, 32, "cpu%d", i);
7541 entry->procname = kstrdup(buf, GFP_KERNEL); 7583 entry->procname = kstrdup(buf, GFP_KERNEL);
7542 entry->mode = 0555; 7584 entry->mode = 0555;
@@ -7666,7 +7708,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7666 spin_lock_irq(&rq->lock); 7708 spin_lock_irq(&rq->lock);
7667 update_rq_clock(rq); 7709 update_rq_clock(rq);
7668 deactivate_task(rq, rq->idle, 0); 7710 deactivate_task(rq, rq->idle, 0);
7669 rq->idle->static_prio = MAX_PRIO;
7670 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7711 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7671 rq->idle->sched_class = &idle_sched_class; 7712 rq->idle->sched_class = &idle_sched_class;
7672 migrate_dead_tasks(cpu); 7713 migrate_dead_tasks(cpu);
@@ -7740,6 +7781,16 @@ early_initcall(migration_init);
7740 7781
7741#ifdef CONFIG_SCHED_DEBUG 7782#ifdef CONFIG_SCHED_DEBUG
7742 7783
7784static __read_mostly int sched_domain_debug_enabled;
7785
7786static int __init sched_domain_debug_setup(char *str)
7787{
7788 sched_domain_debug_enabled = 1;
7789
7790 return 0;
7791}
7792early_param("sched_debug", sched_domain_debug_setup);
7793
7743static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7794static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7744 struct cpumask *groupmask) 7795 struct cpumask *groupmask)
7745{ 7796{
@@ -7826,6 +7877,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7826 cpumask_var_t groupmask; 7877 cpumask_var_t groupmask;
7827 int level = 0; 7878 int level = 0;
7828 7879
7880 if (!sched_domain_debug_enabled)
7881 return;
7882
7829 if (!sd) { 7883 if (!sd) {
7830 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7884 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7831 return; 7885 return;
@@ -7905,6 +7959,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7905 7959
7906static void free_rootdomain(struct root_domain *rd) 7960static void free_rootdomain(struct root_domain *rd)
7907{ 7961{
7962 synchronize_sched();
7963
7908 cpupri_cleanup(&rd->cpupri); 7964 cpupri_cleanup(&rd->cpupri);
7909 7965
7910 free_cpumask_var(rd->rto_mask); 7966 free_cpumask_var(rd->rto_mask);
@@ -8045,6 +8101,7 @@ static cpumask_var_t cpu_isolated_map;
8045/* Setup the mask of cpus configured for isolated domains */ 8101/* Setup the mask of cpus configured for isolated domains */
8046static int __init isolated_cpu_setup(char *str) 8102static int __init isolated_cpu_setup(char *str)
8047{ 8103{
8104 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8048 cpulist_parse(str, cpu_isolated_map); 8105 cpulist_parse(str, cpu_isolated_map);
8049 return 1; 8106 return 1;
8050} 8107}
@@ -8881,7 +8938,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8881 return __build_sched_domains(cpu_map, NULL); 8938 return __build_sched_domains(cpu_map, NULL);
8882} 8939}
8883 8940
8884static struct cpumask *doms_cur; /* current sched domains */ 8941static cpumask_var_t *doms_cur; /* current sched domains */
8885static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8942static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8886static struct sched_domain_attr *dattr_cur; 8943static struct sched_domain_attr *dattr_cur;
8887 /* attribues of custom domains in 'doms_cur' */ 8944 /* attribues of custom domains in 'doms_cur' */
@@ -8903,6 +8960,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8903 return 0; 8960 return 0;
8904} 8961}
8905 8962
8963cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8964{
8965 int i;
8966 cpumask_var_t *doms;
8967
8968 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8969 if (!doms)
8970 return NULL;
8971 for (i = 0; i < ndoms; i++) {
8972 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8973 free_sched_domains(doms, i);
8974 return NULL;
8975 }
8976 }
8977 return doms;
8978}
8979
8980void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8981{
8982 unsigned int i;
8983 for (i = 0; i < ndoms; i++)
8984 free_cpumask_var(doms[i]);
8985 kfree(doms);
8986}
8987
8906/* 8988/*
8907 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8989 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8908 * For now this just excludes isolated cpus, but could be used to 8990 * For now this just excludes isolated cpus, but could be used to
@@ -8914,12 +8996,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8914 8996
8915 arch_update_cpu_topology(); 8997 arch_update_cpu_topology();
8916 ndoms_cur = 1; 8998 ndoms_cur = 1;
8917 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8999 doms_cur = alloc_sched_domains(ndoms_cur);
8918 if (!doms_cur) 9000 if (!doms_cur)
8919 doms_cur = fallback_doms; 9001 doms_cur = &fallback_doms;
8920 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 9002 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8921 dattr_cur = NULL; 9003 dattr_cur = NULL;
8922 err = build_sched_domains(doms_cur); 9004 err = build_sched_domains(doms_cur[0]);
8923 register_sched_domain_sysctl(); 9005 register_sched_domain_sysctl();
8924 9006
8925 return err; 9007 return err;
@@ -8969,19 +9051,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8969 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9051 * doms_new[] to the current sched domain partitioning, doms_cur[].
8970 * It destroys each deleted domain and builds each new domain. 9052 * It destroys each deleted domain and builds each new domain.
8971 * 9053 *
8972 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9054 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8973 * The masks don't intersect (don't overlap.) We should setup one 9055 * The masks don't intersect (don't overlap.) We should setup one
8974 * sched domain for each mask. CPUs not in any of the cpumasks will 9056 * sched domain for each mask. CPUs not in any of the cpumasks will
8975 * not be load balanced. If the same cpumask appears both in the 9057 * not be load balanced. If the same cpumask appears both in the
8976 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9058 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8977 * it as it is. 9059 * it as it is.
8978 * 9060 *
8979 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9061 * The passed in 'doms_new' should be allocated using
8980 * ownership of it and will kfree it when done with it. If the caller 9062 * alloc_sched_domains. This routine takes ownership of it and will
8981 * failed the kmalloc call, then it can pass in doms_new == NULL && 9063 * free_sched_domains it when done with it. If the caller failed the
8982 * ndoms_new == 1, and partition_sched_domains() will fallback to 9064 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8983 * the single partition 'fallback_doms', it also forces the domains 9065 * and partition_sched_domains() will fallback to the single partition
8984 * to be rebuilt. 9066 * 'fallback_doms', it also forces the domains to be rebuilt.
8985 * 9067 *
8986 * If doms_new == NULL it will be replaced with cpu_online_mask. 9068 * If doms_new == NULL it will be replaced with cpu_online_mask.
8987 * ndoms_new == 0 is a special case for destroying existing domains, 9069 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8989,8 +9071,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8989 * 9071 *
8990 * Call with hotplug lock held 9072 * Call with hotplug lock held
8991 */ 9073 */
8992/* FIXME: Change to struct cpumask *doms_new[] */ 9074void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8993void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8994 struct sched_domain_attr *dattr_new) 9075 struct sched_domain_attr *dattr_new)
8995{ 9076{
8996 int i, j, n; 9077 int i, j, n;
@@ -9009,40 +9090,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
9009 /* Destroy deleted domains */ 9090 /* Destroy deleted domains */
9010 for (i = 0; i < ndoms_cur; i++) { 9091 for (i = 0; i < ndoms_cur; i++) {
9011 for (j = 0; j < n && !new_topology; j++) { 9092 for (j = 0; j < n && !new_topology; j++) {
9012 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9093 if (cpumask_equal(doms_cur[i], doms_new[j])
9013 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9094 && dattrs_equal(dattr_cur, i, dattr_new, j))
9014 goto match1; 9095 goto match1;
9015 } 9096 }
9016 /* no match - a current sched domain not in new doms_new[] */ 9097 /* no match - a current sched domain not in new doms_new[] */
9017 detach_destroy_domains(doms_cur + i); 9098 detach_destroy_domains(doms_cur[i]);
9018match1: 9099match1:
9019 ; 9100 ;
9020 } 9101 }
9021 9102
9022 if (doms_new == NULL) { 9103 if (doms_new == NULL) {
9023 ndoms_cur = 0; 9104 ndoms_cur = 0;
9024 doms_new = fallback_doms; 9105 doms_new = &fallback_doms;
9025 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9106 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9026 WARN_ON_ONCE(dattr_new); 9107 WARN_ON_ONCE(dattr_new);
9027 } 9108 }
9028 9109
9029 /* Build new domains */ 9110 /* Build new domains */
9030 for (i = 0; i < ndoms_new; i++) { 9111 for (i = 0; i < ndoms_new; i++) {
9031 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9112 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9032 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9113 if (cpumask_equal(doms_new[i], doms_cur[j])
9033 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9114 && dattrs_equal(dattr_new, i, dattr_cur, j))
9034 goto match2; 9115 goto match2;
9035 } 9116 }
9036 /* no match - add a new doms_new */ 9117 /* no match - add a new doms_new */
9037 __build_sched_domains(doms_new + i, 9118 __build_sched_domains(doms_new[i],
9038 dattr_new ? dattr_new + i : NULL); 9119 dattr_new ? dattr_new + i : NULL);
9039match2: 9120match2:
9040 ; 9121 ;
9041 } 9122 }
9042 9123
9043 /* Remember the new sched domains */ 9124 /* Remember the new sched domains */
9044 if (doms_cur != fallback_doms) 9125 if (doms_cur != &fallback_doms)
9045 kfree(doms_cur); 9126 free_sched_domains(doms_cur, ndoms_cur);
9046 kfree(dattr_cur); /* kfree(NULL) is safe */ 9127 kfree(dattr_cur); /* kfree(NULL) is safe */
9047 doms_cur = doms_new; 9128 doms_cur = doms_new;
9048 dattr_cur = dattr_new; 9129 dattr_cur = dattr_new;
@@ -9153,8 +9234,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9153 switch (action) { 9234 switch (action) {
9154 case CPU_ONLINE: 9235 case CPU_ONLINE:
9155 case CPU_ONLINE_FROZEN: 9236 case CPU_ONLINE_FROZEN:
9156 case CPU_DEAD: 9237 case CPU_DOWN_PREPARE:
9157 case CPU_DEAD_FROZEN: 9238 case CPU_DOWN_PREPARE_FROZEN:
9239 case CPU_DOWN_FAILED:
9240 case CPU_DOWN_FAILED_FROZEN:
9158 partition_sched_domains(1, NULL, NULL); 9241 partition_sched_domains(1, NULL, NULL);
9159 return NOTIFY_OK; 9242 return NOTIFY_OK;
9160 9243
@@ -9201,7 +9284,7 @@ void __init sched_init_smp(void)
9201#endif 9284#endif
9202 get_online_cpus(); 9285 get_online_cpus();
9203 mutex_lock(&sched_domains_mutex); 9286 mutex_lock(&sched_domains_mutex);
9204 arch_init_sched_domains(cpu_online_mask); 9287 arch_init_sched_domains(cpu_active_mask);
9205 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9288 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9206 if (cpumask_empty(non_isolated_cpus)) 9289 if (cpumask_empty(non_isolated_cpus))
9207 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9290 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9364,10 +9447,6 @@ void __init sched_init(void)
9364#ifdef CONFIG_CPUMASK_OFFSTACK 9447#ifdef CONFIG_CPUMASK_OFFSTACK
9365 alloc_size += num_possible_cpus() * cpumask_size(); 9448 alloc_size += num_possible_cpus() * cpumask_size();
9366#endif 9449#endif
9367 /*
9368 * As sched_init() is called before page_alloc is setup,
9369 * we use alloc_bootmem().
9370 */
9371 if (alloc_size) { 9450 if (alloc_size) {
9372 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9451 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9373 9452
@@ -9522,6 +9601,8 @@ void __init sched_init(void)
9522 rq->cpu = i; 9601 rq->cpu = i;
9523 rq->online = 0; 9602 rq->online = 0;
9524 rq->migration_thread = NULL; 9603 rq->migration_thread = NULL;
9604 rq->idle_stamp = 0;
9605 rq->avg_idle = 2*sysctl_sched_migration_cost;
9525 INIT_LIST_HEAD(&rq->migration_queue); 9606 INIT_LIST_HEAD(&rq->migration_queue);
9526 rq_attach_root(rq, &def_root_domain); 9607 rq_attach_root(rq, &def_root_domain);
9527#endif 9608#endif
@@ -9571,7 +9652,9 @@ void __init sched_init(void)
9571 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9652 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9572 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9653 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9573#endif 9654#endif
9574 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9655 /* May be allocated at isolcpus cmdline parse time */
9656 if (cpu_isolated_map == NULL)
9657 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9575#endif /* SMP */ 9658#endif /* SMP */
9576 9659
9577 perf_event_init(); 9660 perf_event_init();
@@ -9765,13 +9848,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9765 se = kzalloc_node(sizeof(struct sched_entity), 9848 se = kzalloc_node(sizeof(struct sched_entity),
9766 GFP_KERNEL, cpu_to_node(i)); 9849 GFP_KERNEL, cpu_to_node(i));
9767 if (!se) 9850 if (!se)
9768 goto err; 9851 goto err_free_rq;
9769 9852
9770 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9853 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9771 } 9854 }
9772 9855
9773 return 1; 9856 return 1;
9774 9857
9858 err_free_rq:
9859 kfree(cfs_rq);
9775 err: 9860 err:
9776 return 0; 9861 return 0;
9777} 9862}
@@ -9853,13 +9938,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9853 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9938 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9854 GFP_KERNEL, cpu_to_node(i)); 9939 GFP_KERNEL, cpu_to_node(i));
9855 if (!rt_se) 9940 if (!rt_se)
9856 goto err; 9941 goto err_free_rq;
9857 9942
9858 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9943 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9859 } 9944 }
9860 9945
9861 return 1; 9946 return 1;
9862 9947
9948 err_free_rq:
9949 kfree(rt_rq);
9863 err: 9950 err:
9864 return 0; 9951 return 0;
9865} 9952}
@@ -10901,6 +10988,7 @@ void synchronize_sched_expedited(void)
10901 spin_unlock_irqrestore(&rq->lock, flags); 10988 spin_unlock_irqrestore(&rq->lock, flags);
10902 } 10989 }
10903 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10990 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10991 synchronize_sched_expedited_count++;
10904 mutex_unlock(&rcu_sched_expedited_mutex); 10992 mutex_unlock(&rcu_sched_expedited_mutex);
10905 put_online_cpus(); 10993 put_online_cpus();
10906 if (need_full_sync) 10994 if (need_full_sync)